{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4001803697580039, "eval_steps": 500, "global_step": 6656, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.05028445, "auxiliary_loss_mlp": 0.02215396, "balance_loss_clip": 2.43573999, "balance_loss_mlp": 1.76983953, "epoch": 6.012325266796934e-05, "flos": 24456507091200.0, "grad_norm": 55.00561300220404, "language_loss": 2.85272503, "learning_rate": 0.0, "loss": 1.94613922, "num_input_tokens_seen": 19155, "step": 1, "time_per_iteration": 18.059409618377686 }, { "auxiliary_loss_clip": 0.03380539, "auxiliary_loss_mlp": 0.01459449, "balance_loss_clip": 1.62786555, "balance_loss_mlp": 1.18936849, "epoch": 0.00012024650533593868, "flos": 20225931246720.0, "grad_norm": 34.93149751452764, "language_loss": 1.82606053, "learning_rate": 4.4628432569317594e-07, "loss": 1.87446034, "num_input_tokens_seen": 36175, "step": 2, "time_per_iteration": 2.6318798065185547 }, { "auxiliary_loss_clip": 0.03320229, "auxiliary_loss_mlp": 0.01440978, "balance_loss_clip": 1.62577581, "balance_loss_mlp": 1.18882656, "epoch": 0.000180369758003908, "flos": 22309935454080.0, "grad_norm": 32.71870482280511, "language_loss": 1.57573509, "learning_rate": 7.073439208833112e-07, "loss": 1.62334716, "num_input_tokens_seen": 54870, "step": 3, "time_per_iteration": 2.6362481117248535 }, { "auxiliary_loss_clip": 0.03361497, "auxiliary_loss_mlp": 0.01451404, "balance_loss_clip": 1.62418985, "balance_loss_mlp": 1.15500188, "epoch": 0.00024049301067187735, "flos": 22414650577920.0, "grad_norm": 51.2387172839747, "language_loss": 1.67362881, "learning_rate": 8.925686513863519e-07, "loss": 1.72175777, "num_input_tokens_seen": 74575, "step": 4, "time_per_iteration": 2.7070822715759277 }, { "auxiliary_loss_clip": 0.03402497, "auxiliary_loss_mlp": 0.01505358, "balance_loss_clip": 1.62493396, "balance_loss_mlp": 1.21715808, "epoch": 0.0003006162633398467, "flos": 21396978449280.0, "grad_norm": 56.088721215944275, "language_loss": 1.91627169, "learning_rate": 1.0362401141348472e-06, "loss": 1.96535027, "num_input_tokens_seen": 92580, "step": 5, "time_per_iteration": 2.91436767578125 }, { "auxiliary_loss_clip": 0.03370454, "auxiliary_loss_mlp": 0.01515599, "balance_loss_clip": 1.61556244, "balance_loss_mlp": 1.22110426, "epoch": 0.000360739516007816, "flos": 21652375127040.0, "grad_norm": 33.397169652317885, "language_loss": 1.60591149, "learning_rate": 1.153628246576487e-06, "loss": 1.65477204, "num_input_tokens_seen": 109705, "step": 6, "time_per_iteration": 2.994969367980957 }, { "auxiliary_loss_clip": 0.03354239, "auxiliary_loss_mlp": 0.01486417, "balance_loss_clip": 1.61577415, "balance_loss_mlp": 1.20336628, "epoch": 0.0004208627686757854, "flos": 27159742897920.0, "grad_norm": 24.6270766983672, "language_loss": 1.53276002, "learning_rate": 1.2528784983718962e-06, "loss": 1.58116663, "num_input_tokens_seen": 129425, "step": 7, "time_per_iteration": 3.0675876140594482 }, { "auxiliary_loss_clip": 0.03321216, "auxiliary_loss_mlp": 0.0144328, "balance_loss_clip": 1.61205018, "balance_loss_mlp": 1.16499734, "epoch": 0.0004809860213437547, "flos": 31319096135040.0, "grad_norm": 31.71613063643349, "language_loss": 1.43881059, "learning_rate": 1.338852977079528e-06, "loss": 1.48645568, "num_input_tokens_seen": 149210, "step": 8, "time_per_iteration": 3.172358751296997 }, { "auxiliary_loss_clip": 0.03368839, "auxiliary_loss_mlp": 0.01496105, "balance_loss_clip": 1.6120348, "balance_loss_mlp": 1.21229148, "epoch": 0.000541109274011724, "flos": 32160411463680.0, "grad_norm": 28.204490849684397, "language_loss": 1.4969244, "learning_rate": 1.4146878417666224e-06, "loss": 1.54557395, "num_input_tokens_seen": 169055, "step": 9, "time_per_iteration": 3.112215280532837 }, { "auxiliary_loss_clip": 0.03308365, "auxiliary_loss_mlp": 0.01475035, "balance_loss_clip": 1.61541438, "balance_loss_mlp": 1.20647991, "epoch": 0.0006012325266796934, "flos": 18916808163840.0, "grad_norm": 23.420774723604698, "language_loss": 1.44714785, "learning_rate": 1.4825244398280232e-06, "loss": 1.49498188, "num_input_tokens_seen": 188045, "step": 10, "time_per_iteration": 2.9495606422424316 }, { "auxiliary_loss_clip": 0.03364194, "auxiliary_loss_mlp": 0.01494262, "balance_loss_clip": 1.62042511, "balance_loss_mlp": 1.22036684, "epoch": 0.0006613557793476627, "flos": 20774861867520.0, "grad_norm": 18.353281468858004, "language_loss": 1.4520936, "learning_rate": 1.5438901072051983e-06, "loss": 1.50067806, "num_input_tokens_seen": 207035, "step": 11, "time_per_iteration": 3.0797431468963623 }, { "auxiliary_loss_clip": 0.03292683, "auxiliary_loss_mlp": 0.0145154, "balance_loss_clip": 1.60771322, "balance_loss_mlp": 1.17554641, "epoch": 0.000721479032015632, "flos": 16581680997120.0, "grad_norm": 16.61869254675767, "language_loss": 1.45121813, "learning_rate": 1.5999125722696629e-06, "loss": 1.49866033, "num_input_tokens_seen": 223225, "step": 12, "time_per_iteration": 2.9887659549713135 }, { "auxiliary_loss_clip": 0.03321669, "auxiliary_loss_mlp": 0.01405912, "balance_loss_clip": 1.61740756, "balance_loss_mlp": 1.14765704, "epoch": 0.0007816022846836014, "flos": 23805471144960.0, "grad_norm": 14.02187318243825, "language_loss": 1.23759985, "learning_rate": 1.6514482443788434e-06, "loss": 1.28487587, "num_input_tokens_seen": 242570, "step": 13, "time_per_iteration": 3.032742977142334 }, { "auxiliary_loss_clip": 0.03287474, "auxiliary_loss_mlp": 0.01470749, "balance_loss_clip": 1.61299658, "balance_loss_mlp": 1.20257616, "epoch": 0.0008417255373515708, "flos": 19172204841600.0, "grad_norm": 5.790568956401358, "language_loss": 1.20684385, "learning_rate": 1.6991628240650723e-06, "loss": 1.254426, "num_input_tokens_seen": 261215, "step": 14, "time_per_iteration": 3.002887487411499 }, { "auxiliary_loss_clip": 0.03272826, "auxiliary_loss_mlp": 0.01431255, "balance_loss_clip": 1.6181426, "balance_loss_mlp": 1.16804111, "epoch": 0.00090184879001954, "flos": 26395564026240.0, "grad_norm": 6.353887091300461, "language_loss": 1.12925518, "learning_rate": 1.7435840350181584e-06, "loss": 1.176296, "num_input_tokens_seen": 280035, "step": 15, "time_per_iteration": 3.0238780975341797 }, { "auxiliary_loss_clip": 0.03238489, "auxiliary_loss_mlp": 0.01411651, "balance_loss_clip": 1.60288334, "balance_loss_mlp": 1.16197944, "epoch": 0.0009619720426875094, "flos": 24679500785280.0, "grad_norm": 4.670310144758637, "language_loss": 1.11125767, "learning_rate": 1.7851373027727038e-06, "loss": 1.15775907, "num_input_tokens_seen": 300265, "step": 16, "time_per_iteration": 4.605847120285034 }, { "auxiliary_loss_clip": 0.03223993, "auxiliary_loss_mlp": 0.01417304, "balance_loss_clip": 1.60910368, "balance_loss_mlp": 1.17774093, "epoch": 0.0010220952953554788, "flos": 18624531196800.0, "grad_norm": 8.838429022323517, "language_loss": 1.12645221, "learning_rate": 1.8241705979033208e-06, "loss": 1.17286515, "num_input_tokens_seen": 317375, "step": 17, "time_per_iteration": 4.579033851623535 }, { "auxiliary_loss_clip": 0.03161492, "auxiliary_loss_mlp": 0.01379312, "balance_loss_clip": 1.60685277, "balance_loss_mlp": 1.1475693, "epoch": 0.001082218548023448, "flos": 26142537646080.0, "grad_norm": 3.823557061532633, "language_loss": 1.08069181, "learning_rate": 1.860972167459798e-06, "loss": 1.12609982, "num_input_tokens_seen": 337975, "step": 18, "time_per_iteration": 3.0132579803466797 }, { "auxiliary_loss_clip": 0.0318761, "auxiliary_loss_mlp": 0.01403306, "balance_loss_clip": 1.60585093, "balance_loss_mlp": 1.13799417, "epoch": 0.0011423418006914173, "flos": 19609776322560.0, "grad_norm": 4.403621106373983, "language_loss": 1.02445412, "learning_rate": 1.89578346593066e-06, "loss": 1.07036328, "num_input_tokens_seen": 356635, "step": 19, "time_per_iteration": 3.016176462173462 }, { "auxiliary_loss_clip": 0.0313029, "auxiliary_loss_mlp": 0.01342049, "balance_loss_clip": 1.60759044, "balance_loss_mlp": 1.12155962, "epoch": 0.0012024650533593868, "flos": 17895365107200.0, "grad_norm": 3.958333686933058, "language_loss": 1.16706228, "learning_rate": 1.928808765521199e-06, "loss": 1.21178555, "num_input_tokens_seen": 375625, "step": 20, "time_per_iteration": 3.0274486541748047 }, { "auxiliary_loss_clip": 0.03118109, "auxiliary_loss_mlp": 0.01378536, "balance_loss_clip": 1.58886433, "balance_loss_mlp": 1.1298182, "epoch": 0.001262588306027356, "flos": 21252043071360.0, "grad_norm": 4.333519066420982, "language_loss": 1.06129968, "learning_rate": 1.9602224192552076e-06, "loss": 1.10626626, "num_input_tokens_seen": 394350, "step": 21, "time_per_iteration": 2.9418578147888184 }, { "auxiliary_loss_clip": 0.03013912, "auxiliary_loss_mlp": 0.0137937, "balance_loss_clip": 1.57028937, "balance_loss_mlp": 1.14552903, "epoch": 0.0013227115586953253, "flos": 26104077158400.0, "grad_norm": 3.63841390311849, "language_loss": 1.05861485, "learning_rate": 1.9901744328983746e-06, "loss": 1.10254765, "num_input_tokens_seen": 413255, "step": 22, "time_per_iteration": 2.9651288986206055 }, { "auxiliary_loss_clip": 0.02966296, "auxiliary_loss_mlp": 0.01334065, "balance_loss_clip": 1.57175612, "balance_loss_mlp": 1.12377954, "epoch": 0.0013828348113632948, "flos": 23951376190080.0, "grad_norm": 2.8746130742538347, "language_loss": 0.9177655, "learning_rate": 2.018794797290208e-06, "loss": 0.96076906, "num_input_tokens_seen": 433065, "step": 23, "time_per_iteration": 3.049853563308716 }, { "auxiliary_loss_clip": 0.02932793, "auxiliary_loss_mlp": 0.01362183, "balance_loss_clip": 1.56404662, "balance_loss_mlp": 1.14236116, "epoch": 0.001442958064031264, "flos": 15959851724160.0, "grad_norm": 3.0897201135857735, "language_loss": 1.08192635, "learning_rate": 2.046196897962839e-06, "loss": 1.12487614, "num_input_tokens_seen": 451175, "step": 24, "time_per_iteration": 3.0543172359466553 }, { "auxiliary_loss_clip": 0.02823838, "auxiliary_loss_mlp": 0.01329007, "balance_loss_clip": 1.55692792, "balance_loss_mlp": 1.11853111, "epoch": 0.0015030813166992333, "flos": 18108350801280.0, "grad_norm": 4.111246686692462, "language_loss": 1.01367807, "learning_rate": 2.0724802282696944e-06, "loss": 1.05520654, "num_input_tokens_seen": 468775, "step": 25, "time_per_iteration": 3.0059614181518555 }, { "auxiliary_loss_clip": 0.02818207, "auxiliary_loss_mlp": 0.01309454, "balance_loss_clip": 1.55974329, "balance_loss_mlp": 1.10012197, "epoch": 0.0015632045693672028, "flos": 22234558763520.0, "grad_norm": 2.7163042439620018, "language_loss": 1.0669204, "learning_rate": 2.0977325700720194e-06, "loss": 1.10819697, "num_input_tokens_seen": 488530, "step": 26, "time_per_iteration": 3.1159534454345703 }, { "auxiliary_loss_clip": 0.0276047, "auxiliary_loss_mlp": 0.01325034, "balance_loss_clip": 1.54973662, "balance_loss_mlp": 1.12533486, "epoch": 0.001623327822035172, "flos": 23991955580160.0, "grad_norm": 2.562596284241794, "language_loss": 0.95537072, "learning_rate": 2.122031762649933e-06, "loss": 0.99622583, "num_input_tokens_seen": 510495, "step": 27, "time_per_iteration": 3.018643617630005 }, { "auxiliary_loss_clip": 0.02736222, "auxiliary_loss_mlp": 0.01311707, "balance_loss_clip": 1.55399776, "balance_loss_mlp": 1.13089037, "epoch": 0.0016834510747031415, "flos": 19677647070720.0, "grad_norm": 2.42975125432869, "language_loss": 1.06393945, "learning_rate": 2.1454471497582483e-06, "loss": 1.10441875, "num_input_tokens_seen": 528605, "step": 28, "time_per_iteration": 2.9263083934783936 }, { "auxiliary_loss_clip": 0.0270011, "auxiliary_loss_mlp": 0.0131913, "balance_loss_clip": 1.53841436, "balance_loss_mlp": 1.13297284, "epoch": 0.0017435743273711108, "flos": 20923819568640.0, "grad_norm": 4.42090805909513, "language_loss": 1.02493238, "learning_rate": 2.1680407726407727e-06, "loss": 1.06512475, "num_input_tokens_seen": 548515, "step": 29, "time_per_iteration": 3.0062997341156006 }, { "auxiliary_loss_clip": 0.0269246, "auxiliary_loss_mlp": 0.01312758, "balance_loss_clip": 1.53459728, "balance_loss_mlp": 1.12631428, "epoch": 0.00180369758003908, "flos": 19528976678400.0, "grad_norm": 3.1534114534186446, "language_loss": 1.19265521, "learning_rate": 2.189868360711334e-06, "loss": 1.23270726, "num_input_tokens_seen": 564025, "step": 30, "time_per_iteration": 2.931145429611206 }, { "auxiliary_loss_clip": 0.02610377, "auxiliary_loss_mlp": 0.01337183, "balance_loss_clip": 1.52116311, "balance_loss_mlp": 1.15665221, "epoch": 0.0018638208327070496, "flos": 27453169100160.0, "grad_norm": 2.735994596991484, "language_loss": 1.02616811, "learning_rate": 2.2109801597326265e-06, "loss": 1.06564379, "num_input_tokens_seen": 583345, "step": 31, "time_per_iteration": 2.993251085281372 }, { "auxiliary_loss_clip": 0.02582044, "auxiliary_loss_mlp": 0.01331305, "balance_loss_clip": 1.522609, "balance_loss_mlp": 1.15163302, "epoch": 0.0019239440853750188, "flos": 13589460380160.0, "grad_norm": 3.9056907796043654, "language_loss": 0.95266509, "learning_rate": 2.2314216284658796e-06, "loss": 0.99179864, "num_input_tokens_seen": 600010, "step": 32, "time_per_iteration": 2.9459571838378906 }, { "auxiliary_loss_clip": 0.02564836, "auxiliary_loss_mlp": 0.01302659, "balance_loss_clip": 1.51811624, "balance_loss_mlp": 1.13586164, "epoch": 0.001984067338042988, "flos": 11253866336640.0, "grad_norm": 3.226486022987097, "language_loss": 0.95143497, "learning_rate": 2.2512340280885094e-06, "loss": 0.99010992, "num_input_tokens_seen": 616295, "step": 33, "time_per_iteration": 2.9855570793151855 }, { "auxiliary_loss_clip": 0.02421202, "auxiliary_loss_mlp": 0.01304214, "balance_loss_clip": 1.48474145, "balance_loss_mlp": 1.14676213, "epoch": 0.0020441905907109576, "flos": 22386245898240.0, "grad_norm": 2.1714659525821247, "language_loss": 0.91547924, "learning_rate": 2.270454923596497e-06, "loss": 0.9527334, "num_input_tokens_seen": 637640, "step": 34, "time_per_iteration": 2.981541872024536 }, { "auxiliary_loss_clip": 0.02375249, "auxiliary_loss_mlp": 0.01271963, "balance_loss_clip": 1.45095515, "balance_loss_mlp": 1.11689591, "epoch": 0.0021043138433789266, "flos": 49778580337920.0, "grad_norm": 2.2635429103650386, "language_loss": 0.76603377, "learning_rate": 2.2891186125067434e-06, "loss": 0.80250585, "num_input_tokens_seen": 659710, "step": 35, "time_per_iteration": 3.2267208099365234 }, { "auxiliary_loss_clip": 0.02347187, "auxiliary_loss_mlp": 0.01276388, "balance_loss_clip": 1.46356034, "balance_loss_mlp": 1.13238275, "epoch": 0.002164437096046896, "flos": 20557961591040.0, "grad_norm": 2.3605884715298506, "language_loss": 0.88713098, "learning_rate": 2.307256493152974e-06, "loss": 0.92336679, "num_input_tokens_seen": 679670, "step": 36, "time_per_iteration": 2.948162078857422 }, { "auxiliary_loss_clip": 0.02289192, "auxiliary_loss_mlp": 0.01338204, "balance_loss_clip": 1.45043015, "balance_loss_mlp": 1.19105196, "epoch": 0.0022245603487148656, "flos": 26542295084160.0, "grad_norm": 2.4929063351918166, "language_loss": 0.93038809, "learning_rate": 2.3248973825097614e-06, "loss": 0.96666199, "num_input_tokens_seen": 700170, "step": 37, "time_per_iteration": 2.9556422233581543 }, { "auxiliary_loss_clip": 0.02249098, "auxiliary_loss_mlp": 0.01276785, "balance_loss_clip": 1.44485605, "balance_loss_mlp": 1.15500069, "epoch": 0.0022846836013828346, "flos": 20338188226560.0, "grad_norm": 2.177909778954084, "language_loss": 1.03952074, "learning_rate": 2.3420677916238357e-06, "loss": 1.07477951, "num_input_tokens_seen": 718545, "step": 38, "time_per_iteration": 2.9959065914154053 }, { "auxiliary_loss_clip": 0.02216028, "auxiliary_loss_mlp": 0.01260768, "balance_loss_clip": 1.43807542, "balance_loss_mlp": 1.13726676, "epoch": 0.002344806854050804, "flos": 26247575992320.0, "grad_norm": 2.22652515093943, "language_loss": 0.85297108, "learning_rate": 2.358792165262154e-06, "loss": 0.887739, "num_input_tokens_seen": 739865, "step": 39, "time_per_iteration": 3.035399913787842 }, { "auxiliary_loss_clip": 0.02192275, "auxiliary_loss_mlp": 0.01250434, "balance_loss_clip": 1.4289664, "balance_loss_mlp": 1.12216496, "epoch": 0.0024049301067187736, "flos": 11801539981440.0, "grad_norm": 3.258228308703562, "language_loss": 0.90279335, "learning_rate": 2.3750930912143747e-06, "loss": 0.93722045, "num_input_tokens_seen": 755770, "step": 40, "time_per_iteration": 3.060368299484253 }, { "auxiliary_loss_clip": 0.02142113, "auxiliary_loss_mlp": 0.01273783, "balance_loss_clip": 1.41895449, "balance_loss_mlp": 1.16086745, "epoch": 0.0024650533593867426, "flos": 20631506688000.0, "grad_norm": 3.245861029799582, "language_loss": 0.93271625, "learning_rate": 2.3909914837471044e-06, "loss": 0.9668752, "num_input_tokens_seen": 773440, "step": 41, "time_per_iteration": 2.9518353939056396 }, { "auxiliary_loss_clip": 0.02105753, "auxiliary_loss_mlp": 0.01254821, "balance_loss_clip": 1.41097844, "balance_loss_mlp": 1.15168142, "epoch": 0.002525176612054712, "flos": 18406122549120.0, "grad_norm": 3.3039479788253536, "language_loss": 0.97533798, "learning_rate": 2.4065067449483835e-06, "loss": 1.0089438, "num_input_tokens_seen": 790455, "step": 42, "time_per_iteration": 2.933177947998047 }, { "auxiliary_loss_clip": 0.020675, "auxiliary_loss_mlp": 0.01298422, "balance_loss_clip": 1.41198874, "balance_loss_mlp": 1.19189644, "epoch": 0.0025852998647226816, "flos": 28184023128960.0, "grad_norm": 3.15071165872949, "language_loss": 0.97562659, "learning_rate": 2.4216569070848724e-06, "loss": 1.00928593, "num_input_tokens_seen": 810645, "step": 43, "time_per_iteration": 2.9760589599609375 }, { "auxiliary_loss_clip": 0.02086351, "auxiliary_loss_mlp": 0.01314601, "balance_loss_clip": 1.41042757, "balance_loss_mlp": 1.20283043, "epoch": 0.0026454231173906506, "flos": 14283110897280.0, "grad_norm": 2.3612650137146574, "language_loss": 0.93435001, "learning_rate": 2.4364587585915504e-06, "loss": 0.96835947, "num_input_tokens_seen": 827470, "step": 44, "time_per_iteration": 2.9239895343780518 }, { "auxiliary_loss_clip": 0.02043996, "auxiliary_loss_mlp": 0.01272131, "balance_loss_clip": 1.40557313, "balance_loss_mlp": 1.17399764, "epoch": 0.00270554637005862, "flos": 22419211605120.0, "grad_norm": 2.1476860292916644, "language_loss": 0.98677421, "learning_rate": 2.450927955901469e-06, "loss": 1.01993537, "num_input_tokens_seen": 847285, "step": 45, "time_per_iteration": 2.9626305103302 }, { "auxiliary_loss_clip": 0.02018804, "auxiliary_loss_mlp": 0.01228873, "balance_loss_clip": 1.39126372, "balance_loss_mlp": 1.14208817, "epoch": 0.0027656696227265896, "flos": 23985778440960.0, "grad_norm": 1.8862192248435494, "language_loss": 1.02800822, "learning_rate": 2.465079122983384e-06, "loss": 1.06048501, "num_input_tokens_seen": 867545, "step": 46, "time_per_iteration": 2.9913573265075684 }, { "auxiliary_loss_clip": 0.0198766, "auxiliary_loss_mlp": 0.01272862, "balance_loss_clip": 1.38388658, "balance_loss_mlp": 1.182549, "epoch": 0.0028257928753945586, "flos": 37669503087360.0, "grad_norm": 2.1076645953887696, "language_loss": 0.87839413, "learning_rate": 2.4789259401737868e-06, "loss": 0.9109993, "num_input_tokens_seen": 889915, "step": 47, "time_per_iteration": 3.0189881324768066 }, { "auxiliary_loss_clip": 0.01949271, "auxiliary_loss_mlp": 0.01255947, "balance_loss_clip": 1.37360096, "balance_loss_mlp": 1.16963911, "epoch": 0.002885916128062528, "flos": 22454547609600.0, "grad_norm": 4.4561049138068, "language_loss": 0.87809587, "learning_rate": 2.492481223656015e-06, "loss": 0.91014802, "num_input_tokens_seen": 908975, "step": 48, "time_per_iteration": 2.863565444946289 }, { "auxiliary_loss_clip": 0.01949016, "auxiliary_loss_mlp": 0.0124182, "balance_loss_clip": 1.36337733, "balance_loss_mlp": 1.15069616, "epoch": 0.0029460393807304976, "flos": 27012796358400.0, "grad_norm": 2.9451035624229855, "language_loss": 0.89691317, "learning_rate": 2.5057569967437924e-06, "loss": 0.9288215, "num_input_tokens_seen": 929810, "step": 49, "time_per_iteration": 2.9967453479766846 }, { "auxiliary_loss_clip": 0.0194038, "auxiliary_loss_mlp": 0.01234077, "balance_loss_clip": 1.35742152, "balance_loss_mlp": 1.14996314, "epoch": 0.0030061626333984666, "flos": 15851832549120.0, "grad_norm": 3.162716210197168, "language_loss": 0.90914285, "learning_rate": 2.51876455396287e-06, "loss": 0.94088745, "num_input_tokens_seen": 948650, "step": 50, "time_per_iteration": 2.8832523822784424 }, { "auxiliary_loss_clip": 0.01938537, "auxiliary_loss_mlp": 0.01199505, "balance_loss_clip": 1.36240602, "balance_loss_mlp": 1.11844242, "epoch": 0.003066285886066436, "flos": 31827052316160.0, "grad_norm": 6.098010360158733, "language_loss": 0.86977792, "learning_rate": 2.5315145187866316e-06, "loss": 0.90115827, "num_input_tokens_seen": 966455, "step": 51, "time_per_iteration": 2.9061717987060547 }, { "auxiliary_loss_clip": 0.01895637, "auxiliary_loss_mlp": 0.01206588, "balance_loss_clip": 1.35252357, "balance_loss_mlp": 1.12829173, "epoch": 0.0031264091387344056, "flos": 41427482774400.0, "grad_norm": 2.043292881862276, "language_loss": 0.95171362, "learning_rate": 2.5440168957651953e-06, "loss": 0.98273587, "num_input_tokens_seen": 988110, "step": 52, "time_per_iteration": 3.0266616344451904 }, { "auxiliary_loss_clip": 0.01893195, "auxiliary_loss_mlp": 0.01241159, "balance_loss_clip": 1.34894896, "balance_loss_mlp": 1.16162264, "epoch": 0.0031865323914023747, "flos": 23440941970560.0, "grad_norm": 4.2358840345824635, "language_loss": 0.92323011, "learning_rate": 2.5562811176888872e-06, "loss": 0.95457363, "num_input_tokens_seen": 1008550, "step": 53, "time_per_iteration": 2.8850226402282715 }, { "auxiliary_loss_clip": 0.01882736, "auxiliary_loss_mlp": 0.01197045, "balance_loss_clip": 1.35264134, "balance_loss_mlp": 1.11669779, "epoch": 0.003246655644070344, "flos": 14429195510400.0, "grad_norm": 2.290226623360683, "language_loss": 0.8260113, "learning_rate": 2.5683160883431093e-06, "loss": 0.85680908, "num_input_tokens_seen": 1026840, "step": 54, "time_per_iteration": 2.9433553218841553 }, { "auxiliary_loss_clip": 0.01880073, "auxiliary_loss_mlp": 0.01210775, "balance_loss_clip": 1.34162152, "balance_loss_mlp": 1.13233542, "epoch": 0.0033067788967383136, "flos": 35918247496320.0, "grad_norm": 2.911577423572303, "language_loss": 0.81303245, "learning_rate": 2.580130221340046e-06, "loss": 0.84394085, "num_input_tokens_seen": 1048875, "step": 55, "time_per_iteration": 3.0040643215179443 }, { "auxiliary_loss_clip": 0.01870075, "auxiliary_loss_mlp": 0.0120375, "balance_loss_clip": 1.33644819, "balance_loss_mlp": 1.12521541, "epoch": 0.003366902149406283, "flos": 22958732862720.0, "grad_norm": 2.639118679342801, "language_loss": 0.87089968, "learning_rate": 2.5917314754514246e-06, "loss": 0.90163803, "num_input_tokens_seen": 1066435, "step": 56, "time_per_iteration": 2.830453395843506 }, { "auxiliary_loss_clip": 0.01869912, "auxiliary_loss_mlp": 0.01161425, "balance_loss_clip": 1.32921791, "balance_loss_mlp": 1.08851671, "epoch": 0.003427025402074252, "flos": 26582838560640.0, "grad_norm": 2.101574700040827, "language_loss": 0.92890096, "learning_rate": 2.6031273868139713e-06, "loss": 0.95921433, "num_input_tokens_seen": 1090330, "step": 57, "time_per_iteration": 7.0071024894714355 }, { "auxiliary_loss_clip": 0.01833802, "auxiliary_loss_mlp": 0.0121675, "balance_loss_clip": 1.33333457, "balance_loss_mlp": 1.14493799, "epoch": 0.0034871486547422216, "flos": 23951196622080.0, "grad_norm": 14.610065921505914, "language_loss": 0.9972856, "learning_rate": 2.614325098333948e-06, "loss": 1.02779114, "num_input_tokens_seen": 1109840, "step": 58, "time_per_iteration": 2.830960273742676 }, { "auxiliary_loss_clip": 0.0181804, "auxiliary_loss_mlp": 0.01199311, "balance_loss_clip": 1.32073379, "balance_loss_mlp": 1.12835753, "epoch": 0.003547271907410191, "flos": 21214983214080.0, "grad_norm": 2.120622270947527, "language_loss": 0.88172519, "learning_rate": 2.625331386578098e-06, "loss": 0.91189873, "num_input_tokens_seen": 1128415, "step": 59, "time_per_iteration": 2.8507089614868164 }, { "auxiliary_loss_clip": 0.01839573, "auxiliary_loss_mlp": 0.01163328, "balance_loss_clip": 1.32924581, "balance_loss_mlp": 1.09075332, "epoch": 0.00360739516007816, "flos": 16504903676160.0, "grad_norm": 2.021991994360373, "language_loss": 0.93542433, "learning_rate": 2.63615268640451e-06, "loss": 0.96545339, "num_input_tokens_seen": 1146515, "step": 60, "time_per_iteration": 2.8517534732818604 }, { "auxiliary_loss_clip": 0.0181893, "auxiliary_loss_mlp": 0.01176948, "balance_loss_clip": 1.31414318, "balance_loss_mlp": 1.10923755, "epoch": 0.0036675184127461296, "flos": 19464805031040.0, "grad_norm": 2.908283338489548, "language_loss": 0.90021706, "learning_rate": 2.6467951135575943e-06, "loss": 0.9301759, "num_input_tokens_seen": 1166330, "step": 61, "time_per_iteration": 2.8853390216827393 }, { "auxiliary_loss_clip": 0.01803943, "auxiliary_loss_mlp": 0.01142904, "balance_loss_clip": 1.31131864, "balance_loss_mlp": 1.07581341, "epoch": 0.003727641665414099, "flos": 20957323979520.0, "grad_norm": 1.8428161811646855, "language_loss": 0.88479733, "learning_rate": 2.657264485425803e-06, "loss": 0.91426575, "num_input_tokens_seen": 1186010, "step": 62, "time_per_iteration": 2.8860812187194824 }, { "auxiliary_loss_clip": 0.01785338, "auxiliary_loss_mlp": 0.0116457, "balance_loss_clip": 1.30233741, "balance_loss_mlp": 1.09504724, "epoch": 0.003787764918082068, "flos": 18406050721920.0, "grad_norm": 2.4385306002926512, "language_loss": 0.96280968, "learning_rate": 2.6675663401385186e-06, "loss": 0.99230874, "num_input_tokens_seen": 1204985, "step": 63, "time_per_iteration": 2.9081404209136963 }, { "auxiliary_loss_clip": 0.01795068, "auxiliary_loss_mlp": 0.01171321, "balance_loss_clip": 1.31071985, "balance_loss_mlp": 1.10499322, "epoch": 0.0038478881707500376, "flos": 12459243962880.0, "grad_norm": 3.0781639748926697, "language_loss": 0.98840165, "learning_rate": 2.677705954159056e-06, "loss": 1.01806557, "num_input_tokens_seen": 1223545, "step": 64, "time_per_iteration": 2.893603801727295 }, { "auxiliary_loss_clip": 0.01801311, "auxiliary_loss_mlp": 0.01151112, "balance_loss_clip": 1.30960393, "balance_loss_mlp": 1.08368695, "epoch": 0.003908011423418007, "flos": 13553334276480.0, "grad_norm": 2.4813676281781554, "language_loss": 0.85397774, "learning_rate": 2.6876883585136904e-06, "loss": 0.88350195, "num_input_tokens_seen": 1241175, "step": 65, "time_per_iteration": 2.8768796920776367 }, { "auxiliary_loss_clip": 0.01777474, "auxiliary_loss_mlp": 0.01155217, "balance_loss_clip": 1.29563761, "balance_loss_mlp": 1.087888, "epoch": 0.003968134676085976, "flos": 18333475292160.0, "grad_norm": 1.8550079005121831, "language_loss": 0.85281348, "learning_rate": 2.697518353781685e-06, "loss": 0.88214046, "num_input_tokens_seen": 1259315, "step": 66, "time_per_iteration": 2.769274950027466 }, { "auxiliary_loss_clip": 0.01779987, "auxiliary_loss_mlp": 0.01151372, "balance_loss_clip": 1.29312515, "balance_loss_mlp": 1.07650828, "epoch": 0.004028257928753946, "flos": 20485242506880.0, "grad_norm": 2.74895944689593, "language_loss": 0.96567476, "learning_rate": 2.7072005239581103e-06, "loss": 0.99498826, "num_input_tokens_seen": 1277055, "step": 67, "time_per_iteration": 2.889369249343872 }, { "auxiliary_loss_clip": 0.01752442, "auxiliary_loss_mlp": 0.01152779, "balance_loss_clip": 1.28765118, "balance_loss_mlp": 1.08120584, "epoch": 0.004088381181421915, "flos": 18843837684480.0, "grad_norm": 2.109359538419204, "language_loss": 0.94516367, "learning_rate": 2.7167392492896727e-06, "loss": 0.97421581, "num_input_tokens_seen": 1294355, "step": 68, "time_per_iteration": 2.8107409477233887 }, { "auxiliary_loss_clip": 0.01747204, "auxiliary_loss_mlp": 0.0115424, "balance_loss_clip": 1.28511512, "balance_loss_mlp": 1.08476448, "epoch": 0.004148504434089885, "flos": 19427817000960.0, "grad_norm": 2.2931216646069092, "language_loss": 0.96014255, "learning_rate": 2.7261387181735195e-06, "loss": 0.98915702, "num_input_tokens_seen": 1313525, "step": 69, "time_per_iteration": 2.8138387203216553 }, { "auxiliary_loss_clip": 0.01741342, "auxiliary_loss_mlp": 0.01160375, "balance_loss_clip": 1.28807163, "balance_loss_mlp": 1.09581161, "epoch": 0.004208627686757853, "flos": 20811023884800.0, "grad_norm": 2.1764096137707494, "language_loss": 0.98070192, "learning_rate": 2.7354029381999196e-06, "loss": 1.00971913, "num_input_tokens_seen": 1330505, "step": 70, "time_per_iteration": 2.8319084644317627 }, { "auxiliary_loss_clip": 0.0174721, "auxiliary_loss_mlp": 0.01145619, "balance_loss_clip": 1.27791202, "balance_loss_mlp": 1.07685876, "epoch": 0.004268750939425823, "flos": 19098623831040.0, "grad_norm": 2.9300158782571324, "language_loss": 0.94016141, "learning_rate": 2.7445357464116983e-06, "loss": 0.96908975, "num_input_tokens_seen": 1349615, "step": 71, "time_per_iteration": 2.8469433784484863 }, { "auxiliary_loss_clip": 0.01815227, "auxiliary_loss_mlp": 0.01294388, "balance_loss_clip": 1.43495834, "balance_loss_mlp": 1.25490558, "epoch": 0.004328874192093792, "flos": 52439635514880.0, "grad_norm": 2.409331683106634, "language_loss": 0.65682542, "learning_rate": 2.75354081884615e-06, "loss": 0.68792164, "num_input_tokens_seen": 1410275, "step": 72, "time_per_iteration": 3.2019593715667725 }, { "auxiliary_loss_clip": 0.01799527, "auxiliary_loss_mlp": 0.01271558, "balance_loss_clip": 1.43197393, "balance_loss_mlp": 1.2316941, "epoch": 0.004388997444761762, "flos": 66473239564800.0, "grad_norm": 2.25068040880696, "language_loss": 0.63694263, "learning_rate": 2.7624216794188286e-06, "loss": 0.66765356, "num_input_tokens_seen": 1473020, "step": 73, "time_per_iteration": 3.3545596599578857 }, { "auxiliary_loss_clip": 0.01720805, "auxiliary_loss_mlp": 0.01140553, "balance_loss_clip": 1.26912856, "balance_loss_mlp": 1.07279444, "epoch": 0.004449120697429731, "flos": 18952970181120.0, "grad_norm": 2.554977860093902, "language_loss": 0.86212188, "learning_rate": 2.771181708202938e-06, "loss": 0.89073551, "num_input_tokens_seen": 1490385, "step": 74, "time_per_iteration": 2.823498487472534 }, { "auxiliary_loss_clip": 0.0172287, "auxiliary_loss_mlp": 0.01162493, "balance_loss_clip": 1.26811171, "balance_loss_mlp": 1.09344697, "epoch": 0.004509243950097701, "flos": 21105491581440.0, "grad_norm": 3.0087618017840105, "language_loss": 0.97196102, "learning_rate": 2.779824149153005e-06, "loss": 1.00081468, "num_input_tokens_seen": 1509725, "step": 75, "time_per_iteration": 2.888415575027466 }, { "auxiliary_loss_clip": 0.0170198, "auxiliary_loss_mlp": 0.01142315, "balance_loss_clip": 1.26420689, "balance_loss_mlp": 1.07608271, "epoch": 0.004569367202765669, "flos": 20698730991360.0, "grad_norm": 2.6610382542709043, "language_loss": 0.87740695, "learning_rate": 2.788352117317012e-06, "loss": 0.90584993, "num_input_tokens_seen": 1527245, "step": 76, "time_per_iteration": 2.9226863384246826 }, { "auxiliary_loss_clip": 0.01702512, "auxiliary_loss_mlp": 0.01145374, "balance_loss_clip": 1.26239479, "balance_loss_mlp": 1.07656646, "epoch": 0.004629490455433639, "flos": 28658474899200.0, "grad_norm": 2.4272090643104574, "language_loss": 0.91791159, "learning_rate": 2.796768605577095e-06, "loss": 0.94639051, "num_input_tokens_seen": 1548930, "step": 77, "time_per_iteration": 2.8720929622650146 }, { "auxiliary_loss_clip": 0.01693018, "auxiliary_loss_mlp": 0.01165978, "balance_loss_clip": 1.26398146, "balance_loss_mlp": 1.09569168, "epoch": 0.004689613708101608, "flos": 11072409805440.0, "grad_norm": 2.2822185142383034, "language_loss": 0.9211635, "learning_rate": 2.80507649095533e-06, "loss": 0.94975346, "num_input_tokens_seen": 1565695, "step": 78, "time_per_iteration": 2.7832391262054443 }, { "auxiliary_loss_clip": 0.01689271, "auxiliary_loss_mlp": 0.01153255, "balance_loss_clip": 1.25836253, "balance_loss_mlp": 1.08482933, "epoch": 0.004749736960769578, "flos": 21799106184960.0, "grad_norm": 2.263191265943929, "language_loss": 0.82771945, "learning_rate": 2.813278540517843e-06, "loss": 0.85614467, "num_input_tokens_seen": 1582625, "step": 79, "time_per_iteration": 2.7723355293273926 }, { "auxiliary_loss_clip": 0.01702468, "auxiliary_loss_mlp": 0.01130708, "balance_loss_clip": 1.26147008, "balance_loss_mlp": 1.0609467, "epoch": 0.004809860213437547, "flos": 19792597570560.0, "grad_norm": 1.9992491725405546, "language_loss": 0.91272199, "learning_rate": 2.8213774169075505e-06, "loss": 0.94105375, "num_input_tokens_seen": 1601725, "step": 80, "time_per_iteration": 2.742046356201172 }, { "auxiliary_loss_clip": 0.01671156, "auxiliary_loss_mlp": 0.01144048, "balance_loss_clip": 1.25365841, "balance_loss_mlp": 1.07371473, "epoch": 0.004869983466105517, "flos": 26574327037440.0, "grad_norm": 2.0371265012476742, "language_loss": 0.95241439, "learning_rate": 2.829375683533245e-06, "loss": 0.9805665, "num_input_tokens_seen": 1622420, "step": 81, "time_per_iteration": 2.8996386528015137 }, { "auxiliary_loss_clip": 0.01686092, "auxiliary_loss_mlp": 0.01147828, "balance_loss_clip": 1.25779653, "balance_loss_mlp": 1.08149946, "epoch": 0.004930106718773485, "flos": 12823378087680.0, "grad_norm": 2.9441337112970296, "language_loss": 0.96288472, "learning_rate": 2.8372758094402803e-06, "loss": 0.99122393, "num_input_tokens_seen": 1640715, "step": 82, "time_per_iteration": 2.819120407104492 }, { "auxiliary_loss_clip": 0.01668255, "auxiliary_loss_mlp": 0.01156428, "balance_loss_clip": 1.2461338, "balance_loss_mlp": 1.08709574, "epoch": 0.004990229971441455, "flos": 25774919902080.0, "grad_norm": 2.6601797838877856, "language_loss": 0.86762071, "learning_rate": 2.84508017388607e-06, "loss": 0.89586747, "num_input_tokens_seen": 1662210, "step": 83, "time_per_iteration": 2.7959344387054443 }, { "auxiliary_loss_clip": 0.01662665, "auxiliary_loss_mlp": 0.01154043, "balance_loss_clip": 1.24844718, "balance_loss_mlp": 1.084234, "epoch": 0.005050353224109424, "flos": 17457254922240.0, "grad_norm": 2.5416281292503986, "language_loss": 0.92081314, "learning_rate": 2.852791070641559e-06, "loss": 0.94898021, "num_input_tokens_seen": 1681070, "step": 84, "time_per_iteration": 2.7176246643066406 }, { "auxiliary_loss_clip": 0.01647627, "auxiliary_loss_mlp": 0.01154949, "balance_loss_clip": 1.36429358, "balance_loss_mlp": 1.11527622, "epoch": 0.005110476476777394, "flos": 69805460367360.0, "grad_norm": 1.4023430227621099, "language_loss": 0.6252538, "learning_rate": 2.8604107120381682e-06, "loss": 0.65327954, "num_input_tokens_seen": 1747140, "step": 85, "time_per_iteration": 3.296835422515869 }, { "auxiliary_loss_clip": 0.01649469, "auxiliary_loss_mlp": 0.0112642, "balance_loss_clip": 1.23797417, "balance_loss_mlp": 1.05642033, "epoch": 0.005170599729445363, "flos": 24790105739520.0, "grad_norm": 1.805253124779358, "language_loss": 0.90709531, "learning_rate": 2.8679412327780482e-06, "loss": 0.93485421, "num_input_tokens_seen": 1767475, "step": 86, "time_per_iteration": 2.761484146118164 }, { "auxiliary_loss_clip": 0.01653351, "auxiliary_loss_mlp": 0.01158608, "balance_loss_clip": 1.24437881, "balance_loss_mlp": 1.08741617, "epoch": 0.005230722982113333, "flos": 23258048895360.0, "grad_norm": 2.3398213465495776, "language_loss": 0.81961077, "learning_rate": 2.8753846935240833e-06, "loss": 0.8477304, "num_input_tokens_seen": 1784980, "step": 87, "time_per_iteration": 2.763185739517212 }, { "auxiliary_loss_clip": 0.01641581, "auxiliary_loss_mlp": 0.01152623, "balance_loss_clip": 1.24129367, "balance_loss_mlp": 1.08457828, "epoch": 0.005290846234781301, "flos": 16727909264640.0, "grad_norm": 3.1951080427559857, "language_loss": 0.95790672, "learning_rate": 2.8827430842847267e-06, "loss": 0.98584872, "num_input_tokens_seen": 1803030, "step": 88, "time_per_iteration": 2.7855517864227295 }, { "auxiliary_loss_clip": 0.01658657, "auxiliary_loss_mlp": 0.01147064, "balance_loss_clip": 1.24130976, "balance_loss_mlp": 1.07978201, "epoch": 0.005350969487449271, "flos": 20886077352960.0, "grad_norm": 3.405407923072192, "language_loss": 0.86023164, "learning_rate": 2.8900183276075957e-06, "loss": 0.88828892, "num_input_tokens_seen": 1822865, "step": 89, "time_per_iteration": 2.7517924308776855 }, { "auxiliary_loss_clip": 0.01647446, "auxiliary_loss_mlp": 0.01133456, "balance_loss_clip": 1.23541856, "balance_loss_mlp": 1.06727123, "epoch": 0.00541109274011724, "flos": 26209977431040.0, "grad_norm": 2.130771496386599, "language_loss": 0.9150058, "learning_rate": 2.8972122815946455e-06, "loss": 0.94281483, "num_input_tokens_seen": 1842435, "step": 90, "time_per_iteration": 2.7526872158050537 }, { "auxiliary_loss_clip": 0.01629409, "auxiliary_loss_mlp": 0.01133822, "balance_loss_clip": 1.23219132, "balance_loss_mlp": 1.06582534, "epoch": 0.00547121599278521, "flos": 21178569801600.0, "grad_norm": 2.6928798867856796, "language_loss": 0.86073506, "learning_rate": 2.90432674275074e-06, "loss": 0.88836741, "num_input_tokens_seen": 1860065, "step": 91, "time_per_iteration": 2.7995588779449463 }, { "auxiliary_loss_clip": 0.01628638, "auxiliary_loss_mlp": 0.01138916, "balance_loss_clip": 1.22774827, "balance_loss_mlp": 1.07335091, "epoch": 0.005531339245453179, "flos": 19718801078400.0, "grad_norm": 5.062847798051961, "language_loss": 0.87041199, "learning_rate": 2.91136344867656e-06, "loss": 0.8980875, "num_input_tokens_seen": 1878135, "step": 92, "time_per_iteration": 2.7813079357147217 }, { "auxiliary_loss_clip": 0.01620799, "auxiliary_loss_mlp": 0.01174163, "balance_loss_clip": 1.21933174, "balance_loss_mlp": 1.10650027, "epoch": 0.005591462498121149, "flos": 17636089760640.0, "grad_norm": 4.340668874696889, "language_loss": 0.9210887, "learning_rate": 2.918324080615938e-06, "loss": 0.94903833, "num_input_tokens_seen": 1894895, "step": 93, "time_per_iteration": 2.7582218647003174 }, { "auxiliary_loss_clip": 0.0163427, "auxiliary_loss_mlp": 0.01153574, "balance_loss_clip": 1.22659743, "balance_loss_mlp": 1.08238208, "epoch": 0.005651585750789117, "flos": 20011221699840.0, "grad_norm": 4.327341326162078, "language_loss": 0.87578797, "learning_rate": 2.925210265866963e-06, "loss": 0.90366644, "num_input_tokens_seen": 1913220, "step": 94, "time_per_iteration": 2.783581256866455 }, { "auxiliary_loss_clip": 0.01570285, "auxiliary_loss_mlp": 0.01051726, "balance_loss_clip": 1.31970167, "balance_loss_mlp": 1.01376939, "epoch": 0.005711709003457087, "flos": 59812957981440.0, "grad_norm": 1.3608185384271176, "language_loss": 0.68098927, "learning_rate": 2.932023580065507e-06, "loss": 0.70720935, "num_input_tokens_seen": 1970970, "step": 95, "time_per_iteration": 3.1328847408294678 }, { "auxiliary_loss_clip": 0.01612519, "auxiliary_loss_mlp": 0.01150182, "balance_loss_clip": 1.21488237, "balance_loss_mlp": 1.08318627, "epoch": 0.005771832256125056, "flos": 15559591495680.0, "grad_norm": 6.736145376327001, "language_loss": 0.90221369, "learning_rate": 2.9387655493491906e-06, "loss": 0.92984068, "num_input_tokens_seen": 1988930, "step": 96, "time_per_iteration": 2.8015241622924805 }, { "auxiliary_loss_clip": 0.01605814, "auxiliary_loss_mlp": 0.01142022, "balance_loss_clip": 1.21851277, "balance_loss_mlp": 1.08003318, "epoch": 0.005831955508793026, "flos": 22528380015360.0, "grad_norm": 3.8307865500968044, "language_loss": 0.89869905, "learning_rate": 2.9454376524092147e-06, "loss": 0.92617744, "num_input_tokens_seen": 2006285, "step": 97, "time_per_iteration": 4.387299060821533 }, { "auxiliary_loss_clip": 0.01593214, "auxiliary_loss_mlp": 0.01140673, "balance_loss_clip": 1.2102325, "balance_loss_mlp": 1.07200789, "epoch": 0.005892078761460995, "flos": 22049834094720.0, "grad_norm": 2.291581893082518, "language_loss": 0.76274347, "learning_rate": 2.952041322436969e-06, "loss": 0.79008234, "num_input_tokens_seen": 2024905, "step": 98, "time_per_iteration": 2.751507043838501 }, { "auxiliary_loss_clip": 0.01533926, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.29271698, "balance_loss_mlp": 1.00129879, "epoch": 0.005952202014128965, "flos": 68539143317760.0, "grad_norm": 1.0388395506080574, "language_loss": 0.65518898, "learning_rate": 2.9585779489718204e-06, "loss": 0.68089598, "num_input_tokens_seen": 2086220, "step": 99, "time_per_iteration": 3.3125040531158447 }, { "auxiliary_loss_clip": 0.01595694, "auxiliary_loss_mlp": 0.01142556, "balance_loss_clip": 1.21028757, "balance_loss_mlp": 1.07217503, "epoch": 0.006012325266796933, "flos": 22960887678720.0, "grad_norm": 2.051483688350497, "language_loss": 0.90885437, "learning_rate": 2.9650488796560464e-06, "loss": 0.93623686, "num_input_tokens_seen": 2103365, "step": 100, "time_per_iteration": 2.7632548809051514 }, { "auxiliary_loss_clip": 0.01607235, "auxiliary_loss_mlp": 0.01150276, "balance_loss_clip": 1.21294045, "balance_loss_mlp": 1.08394814, "epoch": 0.006072448519464903, "flos": 17347942857600.0, "grad_norm": 2.0181737234491566, "language_loss": 0.91081136, "learning_rate": 2.971455421902446e-06, "loss": 0.9383865, "num_input_tokens_seen": 2121995, "step": 101, "time_per_iteration": 2.7214279174804688 }, { "auxiliary_loss_clip": 0.015938, "auxiliary_loss_mlp": 0.01152009, "balance_loss_clip": 1.21248627, "balance_loss_mlp": 1.08124638, "epoch": 0.006132571772132872, "flos": 24681116897280.0, "grad_norm": 2.076276442041171, "language_loss": 0.90774924, "learning_rate": 2.9777988444798075e-06, "loss": 0.93520737, "num_input_tokens_seen": 2141815, "step": 102, "time_per_iteration": 2.8389108180999756 }, { "auxiliary_loss_clip": 0.01588155, "auxiliary_loss_mlp": 0.01133785, "balance_loss_clip": 1.20914173, "balance_loss_mlp": 1.06912589, "epoch": 0.006192695024800842, "flos": 21465675210240.0, "grad_norm": 2.3272829989328456, "language_loss": 0.88006896, "learning_rate": 2.9840803790210285e-06, "loss": 0.90728837, "num_input_tokens_seen": 2161125, "step": 103, "time_per_iteration": 2.768784761428833 }, { "auxiliary_loss_clip": 0.01588751, "auxiliary_loss_mlp": 0.01136216, "balance_loss_clip": 1.21138883, "balance_loss_mlp": 1.06998372, "epoch": 0.006252818277468811, "flos": 17420410546560.0, "grad_norm": 1.9182889224259552, "language_loss": 0.93644351, "learning_rate": 2.990301221458371e-06, "loss": 0.96369314, "num_input_tokens_seen": 2179510, "step": 104, "time_per_iteration": 2.7109038829803467 }, { "auxiliary_loss_clip": 0.01579421, "auxiliary_loss_mlp": 0.01146524, "balance_loss_clip": 1.20086741, "balance_loss_mlp": 1.08258009, "epoch": 0.006312941530136781, "flos": 19099557584640.0, "grad_norm": 3.0437899698059367, "language_loss": 0.96655375, "learning_rate": 2.9964625333900544e-06, "loss": 0.99381316, "num_input_tokens_seen": 2197870, "step": 105, "time_per_iteration": 2.7254133224487305 }, { "auxiliary_loss_clip": 0.01578331, "auxiliary_loss_mlp": 0.01158544, "balance_loss_clip": 1.20144236, "balance_loss_mlp": 1.08768642, "epoch": 0.006373064782804749, "flos": 24060831909120.0, "grad_norm": 3.1837681777002302, "language_loss": 0.87119448, "learning_rate": 3.002565443382063e-06, "loss": 0.89856327, "num_input_tokens_seen": 2217495, "step": 106, "time_per_iteration": 2.7705447673797607 }, { "auxiliary_loss_clip": 0.01561845, "auxiliary_loss_mlp": 0.01143018, "balance_loss_clip": 1.18746924, "balance_loss_mlp": 1.0751636, "epoch": 0.006433188035472719, "flos": 18332433797760.0, "grad_norm": 2.228856706842439, "language_loss": 0.83398581, "learning_rate": 3.008611048208843e-06, "loss": 0.86103439, "num_input_tokens_seen": 2236520, "step": 107, "time_per_iteration": 2.6885263919830322 }, { "auxiliary_loss_clip": 0.01469631, "auxiliary_loss_mlp": 0.0103327, "balance_loss_clip": 1.25210869, "balance_loss_mlp": 1.00179863, "epoch": 0.006493311288140688, "flos": 62562387594240.0, "grad_norm": 0.9900995959758047, "language_loss": 0.64796811, "learning_rate": 3.014600414036285e-06, "loss": 0.67299712, "num_input_tokens_seen": 2300140, "step": 108, "time_per_iteration": 3.278621196746826 }, { "auxiliary_loss_clip": 0.01552898, "auxiliary_loss_mlp": 0.01132858, "balance_loss_clip": 1.18960094, "balance_loss_mlp": 1.06424141, "epoch": 0.006553434540808658, "flos": 19500141035520.0, "grad_norm": 2.019247660217844, "language_loss": 0.97709465, "learning_rate": 3.0205345775501937e-06, "loss": 1.00395215, "num_input_tokens_seen": 2317320, "step": 109, "time_per_iteration": 2.750502347946167 }, { "auxiliary_loss_clip": 0.01550996, "auxiliary_loss_mlp": 0.01140204, "balance_loss_clip": 1.19136214, "balance_loss_mlp": 1.07430482, "epoch": 0.006613557793476627, "flos": 21105132445440.0, "grad_norm": 1.9540987754213832, "language_loss": 0.84243041, "learning_rate": 3.0264145470332218e-06, "loss": 0.86934245, "num_input_tokens_seen": 2337820, "step": 110, "time_per_iteration": 2.82443904876709 }, { "auxiliary_loss_clip": 0.01544634, "auxiliary_loss_mlp": 0.01151549, "balance_loss_clip": 1.18396342, "balance_loss_mlp": 1.08493507, "epoch": 0.006673681046144597, "flos": 26030747543040.0, "grad_norm": 2.4580319150483563, "language_loss": 0.82940048, "learning_rate": 3.032241303393073e-06, "loss": 0.85636234, "num_input_tokens_seen": 2358560, "step": 111, "time_per_iteration": 2.8308968544006348 }, { "auxiliary_loss_clip": 0.0154596, "auxiliary_loss_mlp": 0.01133366, "balance_loss_clip": 1.18776846, "balance_loss_mlp": 1.06970847, "epoch": 0.006733804298812566, "flos": 23147767163520.0, "grad_norm": 2.356589096997363, "language_loss": 0.93989801, "learning_rate": 3.0380158011446e-06, "loss": 0.9666912, "num_input_tokens_seen": 2379005, "step": 112, "time_per_iteration": 2.8007922172546387 }, { "auxiliary_loss_clip": 0.01549647, "auxiliary_loss_mlp": 0.01136979, "balance_loss_clip": 1.18394601, "balance_loss_mlp": 1.07322621, "epoch": 0.006793927551480535, "flos": 11764444210560.0, "grad_norm": 2.521639841990545, "language_loss": 0.79509294, "learning_rate": 3.0437389693482466e-06, "loss": 0.82195914, "num_input_tokens_seen": 2395610, "step": 113, "time_per_iteration": 2.7599966526031494 }, { "auxiliary_loss_clip": 0.0153736, "auxiliary_loss_mlp": 0.01131524, "balance_loss_clip": 1.18028498, "balance_loss_mlp": 1.06562555, "epoch": 0.006854050804148504, "flos": 19171953446400.0, "grad_norm": 2.343117351168218, "language_loss": 0.93439317, "learning_rate": 3.0494117125071475e-06, "loss": 0.96108204, "num_input_tokens_seen": 2415005, "step": 114, "time_per_iteration": 2.723540782928467 }, { "auxiliary_loss_clip": 0.01544971, "auxiliary_loss_mlp": 0.01138932, "balance_loss_clip": 1.17997146, "balance_loss_mlp": 1.07918465, "epoch": 0.006914174056816474, "flos": 21981891519360.0, "grad_norm": 1.9509019191057126, "language_loss": 0.9463321, "learning_rate": 3.055034911425055e-06, "loss": 0.97317111, "num_input_tokens_seen": 2433965, "step": 115, "time_per_iteration": 2.7077698707580566 }, { "auxiliary_loss_clip": 0.01537699, "auxiliary_loss_mlp": 0.01118178, "balance_loss_clip": 1.17675614, "balance_loss_mlp": 1.05151677, "epoch": 0.006974297309484443, "flos": 16289152634880.0, "grad_norm": 10.363795807176915, "language_loss": 0.82148951, "learning_rate": 3.0606094240271244e-06, "loss": 0.84804827, "num_input_tokens_seen": 2451605, "step": 116, "time_per_iteration": 2.681190013885498 }, { "auxiliary_loss_clip": 0.01528803, "auxiliary_loss_mlp": 0.01126189, "balance_loss_clip": 1.17677391, "balance_loss_mlp": 1.06219721, "epoch": 0.007034420562152413, "flos": 26104005331200.0, "grad_norm": 2.4150591879391627, "language_loss": 0.88368428, "learning_rate": 3.0661360861454656e-06, "loss": 0.91023421, "num_input_tokens_seen": 2472035, "step": 117, "time_per_iteration": 2.776143789291382 }, { "auxiliary_loss_clip": 0.01527909, "auxiliary_loss_mlp": 0.01146127, "balance_loss_clip": 1.17495561, "balance_loss_mlp": 1.08041906, "epoch": 0.007094543814820382, "flos": 14204609723520.0, "grad_norm": 2.3639764059040265, "language_loss": 0.8454417, "learning_rate": 3.071615712271274e-06, "loss": 0.87218207, "num_input_tokens_seen": 2489285, "step": 118, "time_per_iteration": 2.7110469341278076 }, { "auxiliary_loss_clip": 0.01538161, "auxiliary_loss_mlp": 0.01163868, "balance_loss_clip": 1.1759789, "balance_loss_mlp": 1.0984937, "epoch": 0.007154667067488351, "flos": 14976007228800.0, "grad_norm": 2.231843342078736, "language_loss": 0.99319011, "learning_rate": 3.0770490962752172e-06, "loss": 1.02021039, "num_input_tokens_seen": 2506460, "step": 119, "time_per_iteration": 2.674121856689453 }, { "auxiliary_loss_clip": 0.01540018, "auxiliary_loss_mlp": 0.01120611, "balance_loss_clip": 1.17242217, "balance_loss_mlp": 1.05738258, "epoch": 0.00721479032015632, "flos": 20193288762240.0, "grad_norm": 2.7981733983226764, "language_loss": 0.8963809, "learning_rate": 3.082437012097686e-06, "loss": 0.92298722, "num_input_tokens_seen": 2525565, "step": 120, "time_per_iteration": 2.745962381362915 }, { "auxiliary_loss_clip": 0.01524916, "auxiliary_loss_mlp": 0.01129465, "balance_loss_clip": 1.1734432, "balance_loss_mlp": 1.06513989, "epoch": 0.00727491357282429, "flos": 23147228459520.0, "grad_norm": 1.797716104424251, "language_loss": 0.93491542, "learning_rate": 3.0877802144103967e-06, "loss": 0.96145928, "num_input_tokens_seen": 2546605, "step": 121, "time_per_iteration": 2.7924466133117676 }, { "auxiliary_loss_clip": 0.01526294, "auxiliary_loss_mlp": 0.0114832, "balance_loss_clip": 1.17395604, "balance_loss_mlp": 1.08490098, "epoch": 0.007335036825492259, "flos": 15521669712000.0, "grad_norm": 2.3704869501778285, "language_loss": 0.90462255, "learning_rate": 3.09307943925077e-06, "loss": 0.93136871, "num_input_tokens_seen": 2560730, "step": 122, "time_per_iteration": 2.930413246154785 }, { "auxiliary_loss_clip": 0.01521826, "auxiliary_loss_mlp": 0.01146566, "balance_loss_clip": 1.1681807, "balance_loss_mlp": 1.07861674, "epoch": 0.007395160078160229, "flos": 24243365848320.0, "grad_norm": 2.4867163179710037, "language_loss": 0.92660481, "learning_rate": 3.0983354046304154e-06, "loss": 0.95328873, "num_input_tokens_seen": 2579550, "step": 123, "time_per_iteration": 2.7484309673309326 }, { "auxiliary_loss_clip": 0.01519363, "auxiliary_loss_mlp": 0.01127611, "balance_loss_clip": 1.16324139, "balance_loss_mlp": 1.0651449, "epoch": 0.007455283330828198, "flos": 31759792099200.0, "grad_norm": 2.366639004459226, "language_loss": 0.71187961, "learning_rate": 3.103548811118979e-06, "loss": 0.73834932, "num_input_tokens_seen": 2600390, "step": 124, "time_per_iteration": 2.8419976234436035 }, { "auxiliary_loss_clip": 0.01506936, "auxiliary_loss_mlp": 0.01125571, "balance_loss_clip": 1.16464007, "balance_loss_mlp": 1.06167519, "epoch": 0.007515406583496167, "flos": 26615157822720.0, "grad_norm": 2.1632751269766106, "language_loss": 0.88450015, "learning_rate": 3.108720342404542e-06, "loss": 0.91082525, "num_input_tokens_seen": 2620770, "step": 125, "time_per_iteration": 2.823296308517456 }, { "auxiliary_loss_clip": 0.01522239, "auxiliary_loss_mlp": 0.01142214, "balance_loss_clip": 1.16456664, "balance_loss_mlp": 1.07912827, "epoch": 0.007575529836164136, "flos": 18223696350720.0, "grad_norm": 2.6632616920164067, "language_loss": 0.82381976, "learning_rate": 3.1138506658316945e-06, "loss": 0.85046428, "num_input_tokens_seen": 2639900, "step": 126, "time_per_iteration": 2.7325809001922607 }, { "auxiliary_loss_clip": 0.015153, "auxiliary_loss_mlp": 0.01142869, "balance_loss_clip": 1.16330886, "balance_loss_mlp": 1.08088017, "epoch": 0.007635653088832106, "flos": 21580410228480.0, "grad_norm": 3.925284628341409, "language_loss": 0.6743899, "learning_rate": 3.1189404329183404e-06, "loss": 0.7009716, "num_input_tokens_seen": 2657450, "step": 127, "time_per_iteration": 2.709821939468384 }, { "auxiliary_loss_clip": 0.01503057, "auxiliary_loss_mlp": 0.01132416, "balance_loss_clip": 1.165169, "balance_loss_mlp": 1.06861567, "epoch": 0.007695776341500075, "flos": 25375054723200.0, "grad_norm": 2.0535131533503734, "language_loss": 0.8819322, "learning_rate": 3.1239902798522317e-06, "loss": 0.90828693, "num_input_tokens_seen": 2678150, "step": 128, "time_per_iteration": 2.764707565307617 }, { "auxiliary_loss_clip": 0.01505955, "auxiliary_loss_mlp": 0.01144223, "balance_loss_clip": 1.16043079, "balance_loss_mlp": 1.08042252, "epoch": 0.007755899594168045, "flos": 22343906741760.0, "grad_norm": 2.6427711693827005, "language_loss": 0.84719259, "learning_rate": 3.129000827968184e-06, "loss": 0.87369436, "num_input_tokens_seen": 2698290, "step": 129, "time_per_iteration": 2.7472774982452393 }, { "auxiliary_loss_clip": 0.01497871, "auxiliary_loss_mlp": 0.01130211, "balance_loss_clip": 1.15871263, "balance_loss_mlp": 1.06655347, "epoch": 0.007816022846836013, "flos": 22638230784000.0, "grad_norm": 2.366492959329914, "language_loss": 0.97564614, "learning_rate": 3.133972684206866e-06, "loss": 1.00192702, "num_input_tokens_seen": 2717630, "step": 130, "time_per_iteration": 2.6955018043518066 }, { "auxiliary_loss_clip": 0.01492272, "auxiliary_loss_mlp": 0.01134965, "balance_loss_clip": 1.15630865, "balance_loss_mlp": 1.06987715, "epoch": 0.007876146099503984, "flos": 18182901479040.0, "grad_norm": 2.2164470079204572, "language_loss": 0.82658112, "learning_rate": 3.138906441556014e-06, "loss": 0.85285342, "num_input_tokens_seen": 2735835, "step": 131, "time_per_iteration": 2.722247362136841 }, { "auxiliary_loss_clip": 0.01500937, "auxiliary_loss_mlp": 0.01128359, "balance_loss_clip": 1.15885806, "balance_loss_mlp": 1.06694245, "epoch": 0.007936269352171952, "flos": 27119486730240.0, "grad_norm": 2.7663180664822193, "language_loss": 0.82781422, "learning_rate": 3.143802679474861e-06, "loss": 0.85410714, "num_input_tokens_seen": 2756335, "step": 132, "time_per_iteration": 2.7937612533569336 }, { "auxiliary_loss_clip": 0.01491919, "auxiliary_loss_mlp": 0.01128624, "balance_loss_clip": 1.15346444, "balance_loss_mlp": 1.0664922, "epoch": 0.007996392604839923, "flos": 19026335710080.0, "grad_norm": 2.182366740159355, "language_loss": 0.95499313, "learning_rate": 3.1486619643025565e-06, "loss": 0.98119843, "num_input_tokens_seen": 2775090, "step": 133, "time_per_iteration": 2.7380354404449463 }, { "auxiliary_loss_clip": 0.01487746, "auxiliary_loss_mlp": 0.0112871, "balance_loss_clip": 1.16170454, "balance_loss_mlp": 1.06843781, "epoch": 0.008056515857507891, "flos": 25484151306240.0, "grad_norm": 1.8164116645967854, "language_loss": 0.73478442, "learning_rate": 3.153484849651286e-06, "loss": 0.76094896, "num_input_tokens_seen": 2795320, "step": 134, "time_per_iteration": 2.7483408451080322 }, { "auxiliary_loss_clip": 0.01484621, "auxiliary_loss_mlp": 0.01132134, "balance_loss_clip": 1.15115011, "balance_loss_mlp": 1.06695068, "epoch": 0.00811663911017586, "flos": 20557566541440.0, "grad_norm": 5.027018494085059, "language_loss": 0.88792509, "learning_rate": 3.1582718767847806e-06, "loss": 0.91409266, "num_input_tokens_seen": 2812815, "step": 135, "time_per_iteration": 2.6838128566741943 }, { "auxiliary_loss_clip": 0.01487119, "auxiliary_loss_mlp": 0.0113257, "balance_loss_clip": 1.15490174, "balance_loss_mlp": 1.06714821, "epoch": 0.00817676236284383, "flos": 18799738761600.0, "grad_norm": 1.9282722528396903, "language_loss": 0.89138198, "learning_rate": 3.1630235749828485e-06, "loss": 0.91757882, "num_input_tokens_seen": 2830445, "step": 136, "time_per_iteration": 2.726475238800049 }, { "auxiliary_loss_clip": 0.01483417, "auxiliary_loss_mlp": 0.01110724, "balance_loss_clip": 1.1494019, "balance_loss_mlp": 1.05078554, "epoch": 0.008236885615511799, "flos": 23873593288320.0, "grad_norm": 2.2984339413846078, "language_loss": 0.84091324, "learning_rate": 3.1677404618925676e-06, "loss": 0.86685467, "num_input_tokens_seen": 2846965, "step": 137, "time_per_iteration": 7.4708640575408936 }, { "auxiliary_loss_clip": 0.01481848, "auxiliary_loss_mlp": 0.01118837, "balance_loss_clip": 1.1500535, "balance_loss_mlp": 1.05894589, "epoch": 0.00829700886817977, "flos": 24643626076800.0, "grad_norm": 1.69378413504035, "language_loss": 0.9018681, "learning_rate": 3.1724230438666953e-06, "loss": 0.92787492, "num_input_tokens_seen": 2867520, "step": 138, "time_per_iteration": 4.311830520629883 }, { "auxiliary_loss_clip": 0.01469655, "auxiliary_loss_mlp": 0.01123604, "balance_loss_clip": 1.14824438, "balance_loss_mlp": 1.05904007, "epoch": 0.008357132120847738, "flos": 25262007644160.0, "grad_norm": 2.1515203004813785, "language_loss": 0.91478992, "learning_rate": 3.177071816289865e-06, "loss": 0.94072247, "num_input_tokens_seen": 2885675, "step": 139, "time_per_iteration": 2.7678122520446777 }, { "auxiliary_loss_clip": 0.01486799, "auxiliary_loss_mlp": 0.01124947, "balance_loss_clip": 1.15521085, "balance_loss_mlp": 1.06195688, "epoch": 0.008417255373515706, "flos": 27344898529920.0, "grad_norm": 2.305315677890536, "language_loss": 0.85667789, "learning_rate": 3.181687263893095e-06, "loss": 0.88279533, "num_input_tokens_seen": 2905960, "step": 140, "time_per_iteration": 2.8557639122009277 }, { "auxiliary_loss_clip": 0.01473538, "auxiliary_loss_mlp": 0.01122701, "balance_loss_clip": 1.14923954, "balance_loss_mlp": 1.06166625, "epoch": 0.008477378626183677, "flos": 17639070589440.0, "grad_norm": 2.3443620963590455, "language_loss": 0.84346074, "learning_rate": 3.186269861057098e-06, "loss": 0.86942315, "num_input_tokens_seen": 2922780, "step": 141, "time_per_iteration": 2.7656807899475098 }, { "auxiliary_loss_clip": 0.01477141, "auxiliary_loss_mlp": 0.01135217, "balance_loss_clip": 1.14718878, "balance_loss_mlp": 1.07360983, "epoch": 0.008537501878851645, "flos": 13881342297600.0, "grad_norm": 2.29020652115343, "language_loss": 0.8105557, "learning_rate": 3.1908200721048745e-06, "loss": 0.83667928, "num_input_tokens_seen": 2938765, "step": 142, "time_per_iteration": 2.747598171234131 }, { "auxiliary_loss_clip": 0.01378886, "auxiliary_loss_mlp": 0.01060004, "balance_loss_clip": 1.19240355, "balance_loss_mlp": 1.03406358, "epoch": 0.008597625131519616, "flos": 71248101281280.0, "grad_norm": 1.056887207538052, "language_loss": 0.66899812, "learning_rate": 3.195338351584042e-06, "loss": 0.69338703, "num_input_tokens_seen": 3006665, "step": 143, "time_per_iteration": 3.346982002258301 }, { "auxiliary_loss_clip": 0.01467707, "auxiliary_loss_mlp": 0.01123721, "balance_loss_clip": 1.14666772, "balance_loss_mlp": 1.06273365, "epoch": 0.008657748384187584, "flos": 17602836744960.0, "grad_norm": 2.6467048454978523, "language_loss": 0.84356761, "learning_rate": 3.1998251445393258e-06, "loss": 0.86948192, "num_input_tokens_seen": 3024335, "step": 144, "time_per_iteration": 2.762087345123291 }, { "auxiliary_loss_clip": 0.01455701, "auxiliary_loss_mlp": 0.01114511, "balance_loss_clip": 1.14058816, "balance_loss_mlp": 1.05085373, "epoch": 0.008717871636855555, "flos": 19715317459200.0, "grad_norm": 1.8692883316747366, "language_loss": 0.88353741, "learning_rate": 3.204280886775619e-06, "loss": 0.90923953, "num_input_tokens_seen": 3043300, "step": 145, "time_per_iteration": 2.7050039768218994 }, { "auxiliary_loss_clip": 0.01470385, "auxiliary_loss_mlp": 0.01121817, "balance_loss_clip": 1.14247775, "balance_loss_mlp": 1.05873132, "epoch": 0.008777994889523523, "flos": 24717422568960.0, "grad_norm": 1.860830881508538, "language_loss": 0.86182559, "learning_rate": 3.208706005112005e-06, "loss": 0.88774765, "num_input_tokens_seen": 3064610, "step": 146, "time_per_iteration": 2.741013288497925 }, { "auxiliary_loss_clip": 0.01356998, "auxiliary_loss_mlp": 0.01029681, "balance_loss_clip": 1.18072379, "balance_loss_mlp": 1.00431335, "epoch": 0.008838118142191492, "flos": 70132067758080.0, "grad_norm": 0.8598047517885464, "language_loss": 0.60122073, "learning_rate": 3.213100917627104e-06, "loss": 0.6250875, "num_input_tokens_seen": 3130385, "step": 147, "time_per_iteration": 3.27382230758667 }, { "auxiliary_loss_clip": 0.01463009, "auxiliary_loss_mlp": 0.01123472, "balance_loss_clip": 1.14658976, "balance_loss_mlp": 1.06548882, "epoch": 0.008898241394859462, "flos": 20044797937920.0, "grad_norm": 1.8116070485228748, "language_loss": 0.84620225, "learning_rate": 3.2174660338961135e-06, "loss": 0.87206709, "num_input_tokens_seen": 3149760, "step": 148, "time_per_iteration": 2.72910475730896 }, { "auxiliary_loss_clip": 0.01466623, "auxiliary_loss_mlp": 0.01144944, "balance_loss_clip": 1.14777792, "balance_loss_mlp": 1.07985532, "epoch": 0.008958364647527431, "flos": 10743611685120.0, "grad_norm": 2.5530775415688205, "language_loss": 0.88680327, "learning_rate": 3.2218017552198588e-06, "loss": 0.91291893, "num_input_tokens_seen": 3164500, "step": 149, "time_per_iteration": 2.688528537750244 }, { "auxiliary_loss_clip": 0.01463954, "auxiliary_loss_mlp": 0.01114885, "balance_loss_clip": 1.14290714, "balance_loss_mlp": 1.05728304, "epoch": 0.009018487900195401, "flos": 29127467802240.0, "grad_norm": 2.1996557200804823, "language_loss": 0.93269086, "learning_rate": 3.226108474846181e-06, "loss": 0.95847929, "num_input_tokens_seen": 3182455, "step": 150, "time_per_iteration": 2.7901580333709717 }, { "auxiliary_loss_clip": 0.01450819, "auxiliary_loss_mlp": 0.01114571, "balance_loss_clip": 1.13812149, "balance_loss_mlp": 1.05839944, "epoch": 0.00907861115286337, "flos": 32963661354240.0, "grad_norm": 4.690239135210318, "language_loss": 0.7421813, "learning_rate": 3.2303865781839817e-06, "loss": 0.7678352, "num_input_tokens_seen": 3203995, "step": 151, "time_per_iteration": 2.79590106010437 }, { "auxiliary_loss_clip": 0.01463077, "auxiliary_loss_mlp": 0.01128244, "balance_loss_clip": 1.14311624, "balance_loss_mlp": 1.06954527, "epoch": 0.009138734405531338, "flos": 21762441377280.0, "grad_norm": 4.291097242497492, "language_loss": 0.88460332, "learning_rate": 3.234636443010188e-06, "loss": 0.9105165, "num_input_tokens_seen": 3222575, "step": 152, "time_per_iteration": 2.701775550842285 }, { "auxiliary_loss_clip": 0.01462099, "auxiliary_loss_mlp": 0.01122264, "balance_loss_clip": 1.14743185, "balance_loss_mlp": 1.06275451, "epoch": 0.009198857658199309, "flos": 20842517134080.0, "grad_norm": 3.861411936226758, "language_loss": 0.83918798, "learning_rate": 3.238858439669943e-06, "loss": 0.8650316, "num_input_tokens_seen": 3240180, "step": 153, "time_per_iteration": 2.730654716491699 }, { "auxiliary_loss_clip": 0.01453756, "auxiliary_loss_mlp": 0.01136244, "balance_loss_clip": 1.14024806, "balance_loss_mlp": 1.07554269, "epoch": 0.009258980910867277, "flos": 24827381078400.0, "grad_norm": 1.8788427995178905, "language_loss": 0.89924759, "learning_rate": 3.2430529312702712e-06, "loss": 0.92514759, "num_input_tokens_seen": 3259800, "step": 154, "time_per_iteration": 2.8150386810302734 }, { "auxiliary_loss_clip": 0.01457041, "auxiliary_loss_mlp": 0.01148182, "balance_loss_clip": 1.1422174, "balance_loss_mlp": 1.08934021, "epoch": 0.009319104163535248, "flos": 28767786963840.0, "grad_norm": 2.155148564981828, "language_loss": 0.89730597, "learning_rate": 3.2472202738674737e-06, "loss": 0.9233582, "num_input_tokens_seen": 3280400, "step": 155, "time_per_iteration": 2.7780215740203857 }, { "auxiliary_loss_clip": 0.01462257, "auxiliary_loss_mlp": 0.01115972, "balance_loss_clip": 1.14140153, "balance_loss_mlp": 1.0580368, "epoch": 0.009379227416203216, "flos": 16582004219520.0, "grad_norm": 2.6722626388977986, "language_loss": 0.86758631, "learning_rate": 3.2513608166485063e-06, "loss": 0.8933686, "num_input_tokens_seen": 3297600, "step": 156, "time_per_iteration": 2.7195818424224854 }, { "auxiliary_loss_clip": 0.01460326, "auxiliary_loss_mlp": 0.01116019, "balance_loss_clip": 1.14530039, "balance_loss_mlp": 1.05770147, "epoch": 0.009439350668871187, "flos": 18329919845760.0, "grad_norm": 2.3212743339319926, "language_loss": 0.99652225, "learning_rate": 3.2554749021065498e-06, "loss": 1.0222857, "num_input_tokens_seen": 3313635, "step": 157, "time_per_iteration": 2.7530624866485596 }, { "auxiliary_loss_clip": 0.01445494, "auxiliary_loss_mlp": 0.01139991, "balance_loss_clip": 1.14011836, "balance_loss_mlp": 1.08162606, "epoch": 0.009499473921539155, "flos": 24349912565760.0, "grad_norm": 2.2650385025378834, "language_loss": 0.88388717, "learning_rate": 3.2595628662110186e-06, "loss": 0.90974212, "num_input_tokens_seen": 3333735, "step": 158, "time_per_iteration": 2.744640588760376 }, { "auxiliary_loss_clip": 0.01451838, "auxiliary_loss_mlp": 0.01122147, "balance_loss_clip": 1.13977575, "balance_loss_mlp": 1.0630666, "epoch": 0.009559597174207124, "flos": 16399326625920.0, "grad_norm": 2.1807440045696165, "language_loss": 0.86407602, "learning_rate": 3.2636250385721982e-06, "loss": 0.88981581, "num_input_tokens_seen": 3348800, "step": 159, "time_per_iteration": 2.7330005168914795 }, { "auxiliary_loss_clip": 0.01441743, "auxiliary_loss_mlp": 0.01137796, "balance_loss_clip": 1.13474953, "balance_loss_mlp": 1.07752383, "epoch": 0.009619720426875094, "flos": 22856890826880.0, "grad_norm": 1.7296815250329798, "language_loss": 0.86756837, "learning_rate": 3.2676617426007263e-06, "loss": 0.89336377, "num_input_tokens_seen": 3368595, "step": 160, "time_per_iteration": 2.844817876815796 }, { "auxiliary_loss_clip": 0.01447614, "auxiliary_loss_mlp": 0.0112266, "balance_loss_clip": 1.13978457, "balance_loss_mlp": 1.06725168, "epoch": 0.009679843679543063, "flos": 19135001329920.0, "grad_norm": 2.462408333273543, "language_loss": 0.91543746, "learning_rate": 3.2716732956621042e-06, "loss": 0.94114017, "num_input_tokens_seen": 3384975, "step": 161, "time_per_iteration": 2.667666435241699 }, { "auxiliary_loss_clip": 0.01453392, "auxiliary_loss_mlp": 0.01111804, "balance_loss_clip": 1.14104879, "balance_loss_mlp": 1.05610919, "epoch": 0.009739966932211033, "flos": 20302995876480.0, "grad_norm": 1.7914334411859298, "language_loss": 0.91582954, "learning_rate": 3.2756600092264203e-06, "loss": 0.94148147, "num_input_tokens_seen": 3404755, "step": 162, "time_per_iteration": 2.6779961585998535 }, { "auxiliary_loss_clip": 0.0131522, "auxiliary_loss_mlp": 0.01056953, "balance_loss_clip": 1.15019548, "balance_loss_mlp": 1.03358769, "epoch": 0.009800090184879002, "flos": 67034234177280.0, "grad_norm": 1.183297200633083, "language_loss": 0.72292268, "learning_rate": 3.279622189013474e-06, "loss": 0.74664438, "num_input_tokens_seen": 3467210, "step": 163, "time_per_iteration": 3.226755142211914 }, { "auxiliary_loss_clip": 0.01439788, "auxiliary_loss_mlp": 0.01116102, "balance_loss_clip": 1.13873029, "balance_loss_mlp": 1.05921507, "epoch": 0.00986021343754697, "flos": 17164690646400.0, "grad_norm": 3.3372881081540937, "language_loss": 0.84684807, "learning_rate": 3.283560135133457e-06, "loss": 0.87240696, "num_input_tokens_seen": 3483220, "step": 164, "time_per_iteration": 2.768935203552246 }, { "auxiliary_loss_clip": 0.01430933, "auxiliary_loss_mlp": 0.0110117, "balance_loss_clip": 1.13048434, "balance_loss_mlp": 1.04533219, "epoch": 0.00992033669021494, "flos": 17749424148480.0, "grad_norm": 4.079659732294038, "language_loss": 0.89080763, "learning_rate": 3.2874741422233565e-06, "loss": 0.91612864, "num_input_tokens_seen": 3501465, "step": 165, "time_per_iteration": 2.673292875289917 }, { "auxiliary_loss_clip": 0.01433192, "auxiliary_loss_mlp": 0.01128138, "balance_loss_clip": 1.13111067, "balance_loss_mlp": 1.06819916, "epoch": 0.00998045994288291, "flos": 25297164080640.0, "grad_norm": 1.7359539169577796, "language_loss": 0.79931343, "learning_rate": 3.2913644995792465e-06, "loss": 0.82492673, "num_input_tokens_seen": 3520480, "step": 166, "time_per_iteration": 2.762742757797241 }, { "auxiliary_loss_clip": 0.01438026, "auxiliary_loss_mlp": 0.01129718, "balance_loss_clip": 1.13488948, "balance_loss_mlp": 1.07066131, "epoch": 0.01004058319555088, "flos": 32298954220800.0, "grad_norm": 2.3252666324684585, "language_loss": 0.92125285, "learning_rate": 3.2952314912845914e-06, "loss": 0.94693023, "num_input_tokens_seen": 3539570, "step": 167, "time_per_iteration": 2.970964193344116 }, { "auxiliary_loss_clip": 0.01429698, "auxiliary_loss_mlp": 0.01133324, "balance_loss_clip": 1.13294363, "balance_loss_mlp": 1.07734346, "epoch": 0.010100706448218848, "flos": 11319941404800.0, "grad_norm": 13.512238716069085, "language_loss": 0.90781063, "learning_rate": 3.299075396334735e-06, "loss": 0.93344086, "num_input_tokens_seen": 3555465, "step": 168, "time_per_iteration": 2.8039841651916504 }, { "auxiliary_loss_clip": 0.01424367, "auxiliary_loss_mlp": 0.01104795, "balance_loss_clip": 1.12848639, "balance_loss_mlp": 1.04700291, "epoch": 0.010160829700886819, "flos": 29719491765120.0, "grad_norm": 1.6705351130563955, "language_loss": 0.87173021, "learning_rate": 3.3028964887576868e-06, "loss": 0.89702177, "num_input_tokens_seen": 3578970, "step": 169, "time_per_iteration": 2.8215444087982178 }, { "auxiliary_loss_clip": 0.01425902, "auxiliary_loss_mlp": 0.01110538, "balance_loss_clip": 1.13139379, "balance_loss_mlp": 1.05317438, "epoch": 0.010220952953554787, "flos": 20412343854720.0, "grad_norm": 1.7404257397879006, "language_loss": 0.84622329, "learning_rate": 3.306695037731344e-06, "loss": 0.87158769, "num_input_tokens_seen": 3597275, "step": 170, "time_per_iteration": 2.6759181022644043 }, { "auxiliary_loss_clip": 0.0143612, "auxiliary_loss_mlp": 0.01137162, "balance_loss_clip": 1.13149834, "balance_loss_mlp": 1.07874942, "epoch": 0.010281076206222756, "flos": 31285124847360.0, "grad_norm": 2.174517661608974, "language_loss": 0.89936447, "learning_rate": 3.3104713076972827e-06, "loss": 0.92509729, "num_input_tokens_seen": 3618905, "step": 171, "time_per_iteration": 2.800394058227539 }, { "auxiliary_loss_clip": 0.01430673, "auxiliary_loss_mlp": 0.01108779, "balance_loss_clip": 1.1347487, "balance_loss_mlp": 1.05382347, "epoch": 0.010341199458890726, "flos": 21982286568960.0, "grad_norm": 1.938241860949196, "language_loss": 0.88895655, "learning_rate": 3.314225558471224e-06, "loss": 0.91435111, "num_input_tokens_seen": 3639610, "step": 172, "time_per_iteration": 2.755190849304199 }, { "auxiliary_loss_clip": 0.01418638, "auxiliary_loss_mlp": 0.01118471, "balance_loss_clip": 1.12744904, "balance_loss_mlp": 1.06270456, "epoch": 0.010401322711558695, "flos": 30810529422720.0, "grad_norm": 1.7925778946034159, "language_loss": 0.80943549, "learning_rate": 3.317958045350308e-06, "loss": 0.83480656, "num_input_tokens_seen": 3664030, "step": 173, "time_per_iteration": 2.751945734024048 }, { "auxiliary_loss_clip": 0.01429615, "auxiliary_loss_mlp": 0.01107965, "balance_loss_clip": 1.13108575, "balance_loss_mlp": 1.05534625, "epoch": 0.010461445964226665, "flos": 24715124098560.0, "grad_norm": 2.1644843911099216, "language_loss": 0.82763064, "learning_rate": 3.3216690192172596e-06, "loss": 0.85300648, "num_input_tokens_seen": 3683615, "step": 174, "time_per_iteration": 2.676630735397339 }, { "auxiliary_loss_clip": 0.01423443, "auxiliary_loss_mlp": 0.01120976, "balance_loss_clip": 1.12816644, "balance_loss_mlp": 1.06523335, "epoch": 0.010521569216894634, "flos": 27710361457920.0, "grad_norm": 2.331494685324117, "language_loss": 0.72837007, "learning_rate": 3.325358726641591e-06, "loss": 0.75381434, "num_input_tokens_seen": 3704540, "step": 175, "time_per_iteration": 2.6876866817474365 }, { "auxiliary_loss_clip": 0.01425333, "auxiliary_loss_mlp": 0.01127215, "balance_loss_clip": 1.12866652, "balance_loss_mlp": 1.06980324, "epoch": 0.010581692469562603, "flos": 12458346122880.0, "grad_norm": 4.811985773634618, "language_loss": 0.97983754, "learning_rate": 3.329027409977902e-06, "loss": 1.00536299, "num_input_tokens_seen": 3721320, "step": 176, "time_per_iteration": 2.8159937858581543 }, { "auxiliary_loss_clip": 0.0141033, "auxiliary_loss_mlp": 0.01130651, "balance_loss_clip": 1.12546706, "balance_loss_mlp": 1.07738805, "epoch": 0.010641815722230573, "flos": 19427601519360.0, "grad_norm": 2.8326118759658585, "language_loss": 0.76926064, "learning_rate": 3.3326753074614087e-06, "loss": 0.7946704, "num_input_tokens_seen": 3739385, "step": 177, "time_per_iteration": 5.7707555294036865 }, { "auxiliary_loss_clip": 0.01421858, "auxiliary_loss_mlp": 0.01104718, "balance_loss_clip": 1.12455702, "balance_loss_mlp": 1.05002475, "epoch": 0.010701938974898541, "flos": 18332577452160.0, "grad_norm": 2.6517911185675014, "language_loss": 0.76942402, "learning_rate": 3.3363026533007716e-06, "loss": 0.79468977, "num_input_tokens_seen": 3756360, "step": 178, "time_per_iteration": 4.337082386016846 }, { "auxiliary_loss_clip": 0.01430293, "auxiliary_loss_mlp": 0.01109414, "balance_loss_clip": 1.1303575, "balance_loss_mlp": 1.05252683, "epoch": 0.010762062227566512, "flos": 19203985399680.0, "grad_norm": 2.6843360372821925, "language_loss": 0.84022826, "learning_rate": 3.3399096777683303e-06, "loss": 0.86562538, "num_input_tokens_seen": 3773930, "step": 179, "time_per_iteration": 2.6826629638671875 }, { "auxiliary_loss_clip": 0.01418094, "auxiliary_loss_mlp": 0.01108667, "balance_loss_clip": 1.12202275, "balance_loss_mlp": 1.05158973, "epoch": 0.01082218548023448, "flos": 31425427370880.0, "grad_norm": 2.0256655839140083, "language_loss": 0.83674574, "learning_rate": 3.3434966072878213e-06, "loss": 0.86201334, "num_input_tokens_seen": 3793630, "step": 180, "time_per_iteration": 2.7483785152435303 }, { "auxiliary_loss_clip": 0.01421326, "auxiliary_loss_mlp": 0.01120347, "balance_loss_clip": 1.12740374, "balance_loss_mlp": 1.0646286, "epoch": 0.01088230873290245, "flos": 25046436170880.0, "grad_norm": 3.253139118534122, "language_loss": 0.77958715, "learning_rate": 3.3470636645196674e-06, "loss": 0.80500388, "num_input_tokens_seen": 3813610, "step": 181, "time_per_iteration": 2.698941469192505 }, { "auxiliary_loss_clip": 0.01414948, "auxiliary_loss_mlp": 0.01130231, "balance_loss_clip": 1.12188053, "balance_loss_mlp": 1.07577634, "epoch": 0.01094243198557042, "flos": 22893411980160.0, "grad_norm": 2.56637338396407, "language_loss": 0.76438594, "learning_rate": 3.3506110684439156e-06, "loss": 0.78983772, "num_input_tokens_seen": 3831390, "step": 182, "time_per_iteration": 2.6951375007629395 }, { "auxiliary_loss_clip": 0.01412526, "auxiliary_loss_mlp": 0.01126665, "balance_loss_clip": 1.12167537, "balance_loss_mlp": 1.0702554, "epoch": 0.011002555238238388, "flos": 17165049782400.0, "grad_norm": 2.083158831639218, "language_loss": 0.87484097, "learning_rate": 3.3541390344409054e-06, "loss": 0.90023291, "num_input_tokens_seen": 3849705, "step": 183, "time_per_iteration": 2.733753204345703 }, { "auxiliary_loss_clip": 0.01415922, "auxiliary_loss_mlp": 0.01110585, "balance_loss_clip": 1.12529624, "balance_loss_mlp": 1.05922985, "epoch": 0.011062678490906358, "flos": 22310150935680.0, "grad_norm": 3.105080129831269, "language_loss": 0.86911464, "learning_rate": 3.357647774369736e-06, "loss": 0.89437973, "num_input_tokens_seen": 3869230, "step": 184, "time_per_iteration": 2.6783828735351562 }, { "auxiliary_loss_clip": 0.01410648, "auxiliary_loss_mlp": 0.01108321, "balance_loss_clip": 1.12499499, "balance_loss_mlp": 1.05203021, "epoch": 0.011122801743574327, "flos": 24388373053440.0, "grad_norm": 1.8650514063709744, "language_loss": 0.83885491, "learning_rate": 3.3611374966446085e-06, "loss": 0.86404455, "num_input_tokens_seen": 3889735, "step": 185, "time_per_iteration": 2.6863327026367188 }, { "auxiliary_loss_clip": 0.01419384, "auxiliary_loss_mlp": 0.01107812, "balance_loss_clip": 1.12355363, "balance_loss_mlp": 1.04999495, "epoch": 0.011182924996242297, "flos": 18150258994560.0, "grad_norm": 2.8933407749520743, "language_loss": 0.71027243, "learning_rate": 3.3646084063091142e-06, "loss": 0.73554444, "num_input_tokens_seen": 3908855, "step": 186, "time_per_iteration": 2.819805383682251 }, { "auxiliary_loss_clip": 0.01415699, "auxiliary_loss_mlp": 0.01108312, "balance_loss_clip": 1.12262082, "balance_loss_mlp": 1.05574071, "epoch": 0.011243048248910266, "flos": 15486800584320.0, "grad_norm": 2.4244794785226733, "language_loss": 1.01999915, "learning_rate": 3.3680607051085194e-06, "loss": 1.04523933, "num_input_tokens_seen": 3923865, "step": 187, "time_per_iteration": 2.65875506401062 }, { "auxiliary_loss_clip": 0.01404987, "auxiliary_loss_mlp": 0.01107995, "balance_loss_clip": 1.12269068, "balance_loss_mlp": 1.05253887, "epoch": 0.011303171501578235, "flos": 40916868986880.0, "grad_norm": 2.0089158406542524, "language_loss": 0.74998611, "learning_rate": 3.371494591560139e-06, "loss": 0.77511597, "num_input_tokens_seen": 3946870, "step": 188, "time_per_iteration": 2.8631174564361572 }, { "auxiliary_loss_clip": 0.01298557, "auxiliary_loss_mlp": 0.01067058, "balance_loss_clip": 1.14124644, "balance_loss_mlp": 1.04474187, "epoch": 0.011363294754246205, "flos": 66302697790080.0, "grad_norm": 0.7620731385906954, "language_loss": 0.56192517, "learning_rate": 3.3749102610218297e-06, "loss": 0.5855813, "num_input_tokens_seen": 4010005, "step": 189, "time_per_iteration": 3.2704074382781982 }, { "auxiliary_loss_clip": 0.01402206, "auxiliary_loss_mlp": 0.011217, "balance_loss_clip": 1.11730003, "balance_loss_mlp": 1.06662548, "epoch": 0.011423418006914174, "flos": 24900279730560.0, "grad_norm": 2.640219984380571, "language_loss": 0.95085573, "learning_rate": 3.3783079057586833e-06, "loss": 0.97609472, "num_input_tokens_seen": 4029035, "step": 190, "time_per_iteration": 2.6898255348205566 }, { "auxiliary_loss_clip": 0.01405088, "auxiliary_loss_mlp": 0.01103893, "balance_loss_clip": 1.11979234, "balance_loss_mlp": 1.05167961, "epoch": 0.011483541259582144, "flos": 19791879298560.0, "grad_norm": 4.133813113517846, "language_loss": 0.8463847, "learning_rate": 3.3816877150079665e-06, "loss": 0.8714745, "num_input_tokens_seen": 4046995, "step": 191, "time_per_iteration": 2.71589994430542 }, { "auxiliary_loss_clip": 0.01403196, "auxiliary_loss_mlp": 0.01118385, "balance_loss_clip": 1.11570346, "balance_loss_mlp": 1.06624269, "epoch": 0.011543664512250112, "flos": 26176939896960.0, "grad_norm": 2.0065119945705887, "language_loss": 0.91894913, "learning_rate": 3.385049875042367e-06, "loss": 0.94416493, "num_input_tokens_seen": 4065865, "step": 192, "time_per_iteration": 2.775974988937378 }, { "auxiliary_loss_clip": 0.01398496, "auxiliary_loss_mlp": 0.01118924, "balance_loss_clip": 1.11665678, "balance_loss_mlp": 1.06117916, "epoch": 0.011603787764918083, "flos": 23768985905280.0, "grad_norm": 2.10033302347605, "language_loss": 0.86923265, "learning_rate": 3.3883945692315938e-06, "loss": 0.89440691, "num_input_tokens_seen": 4085305, "step": 193, "time_per_iteration": 2.792947292327881 }, { "auxiliary_loss_clip": 0.01402535, "auxiliary_loss_mlp": 0.01102276, "balance_loss_clip": 1.11514282, "balance_loss_mlp": 1.05061066, "epoch": 0.011663911017586051, "flos": 25954688494080.0, "grad_norm": 2.2253165290939076, "language_loss": 0.92296255, "learning_rate": 3.3917219781023906e-06, "loss": 0.94801068, "num_input_tokens_seen": 4105185, "step": 194, "time_per_iteration": 2.6886558532714844 }, { "auxiliary_loss_clip": 0.01407209, "auxiliary_loss_mlp": 0.01108641, "balance_loss_clip": 1.11930478, "balance_loss_mlp": 1.05630851, "epoch": 0.01172403427025402, "flos": 17895149625600.0, "grad_norm": 2.4241235245311503, "language_loss": 0.89768875, "learning_rate": 3.3950322793970014e-06, "loss": 0.92284721, "num_input_tokens_seen": 4123160, "step": 195, "time_per_iteration": 2.654517889022827 }, { "auxiliary_loss_clip": 0.01400339, "auxiliary_loss_mlp": 0.01114485, "balance_loss_clip": 1.11779022, "balance_loss_mlp": 1.05981565, "epoch": 0.01178415752292199, "flos": 17894539094400.0, "grad_norm": 3.1130999341447385, "language_loss": 0.86019921, "learning_rate": 3.3983256481301445e-06, "loss": 0.88534749, "num_input_tokens_seen": 4140425, "step": 196, "time_per_iteration": 2.643598794937134 }, { "auxiliary_loss_clip": 0.01398067, "auxiliary_loss_mlp": 0.01107082, "balance_loss_clip": 1.11464977, "balance_loss_mlp": 1.05308056, "epoch": 0.011844280775589959, "flos": 22893555634560.0, "grad_norm": 3.666533247373141, "language_loss": 0.93052697, "learning_rate": 3.4016022566445335e-06, "loss": 0.95557845, "num_input_tokens_seen": 4159555, "step": 197, "time_per_iteration": 2.7120354175567627 }, { "auxiliary_loss_clip": 0.01396424, "auxiliary_loss_mlp": 0.01112388, "balance_loss_clip": 1.11625624, "balance_loss_mlp": 1.05943501, "epoch": 0.01190440402825793, "flos": 26980333441920.0, "grad_norm": 1.9614954763997827, "language_loss": 0.79043806, "learning_rate": 3.4048622746649966e-06, "loss": 0.81552619, "num_input_tokens_seen": 4180480, "step": 198, "time_per_iteration": 2.774059772491455 }, { "auxiliary_loss_clip": 0.0139305, "auxiliary_loss_mlp": 0.01120527, "balance_loss_clip": 1.11708748, "balance_loss_mlp": 1.06821764, "epoch": 0.011964527280925898, "flos": 20521584092160.0, "grad_norm": 1.8823459083646328, "language_loss": 0.88239717, "learning_rate": 3.4081058693512278e-06, "loss": 0.90753293, "num_input_tokens_seen": 4198835, "step": 199, "time_per_iteration": 2.6808881759643555 }, { "auxiliary_loss_clip": 0.01403709, "auxiliary_loss_mlp": 0.0112899, "balance_loss_clip": 1.11951399, "balance_loss_mlp": 1.07200766, "epoch": 0.012024650533593867, "flos": 27745984771200.0, "grad_norm": 2.0663906916258497, "language_loss": 0.81151628, "learning_rate": 3.411333205349222e-06, "loss": 0.83684325, "num_input_tokens_seen": 4219335, "step": 200, "time_per_iteration": 2.625380516052246 }, { "auxiliary_loss_clip": 0.0140201, "auxiliary_loss_mlp": 0.01104413, "balance_loss_clip": 1.11633158, "balance_loss_mlp": 1.05048287, "epoch": 0.012084773786261837, "flos": 10452017076480.0, "grad_norm": 2.253120238884594, "language_loss": 0.87696433, "learning_rate": 3.4145444448414217e-06, "loss": 0.90202856, "num_input_tokens_seen": 4236940, "step": 201, "time_per_iteration": 2.6062326431274414 }, { "auxiliary_loss_clip": 0.01399494, "auxiliary_loss_mlp": 0.01115643, "balance_loss_clip": 1.11764228, "balance_loss_mlp": 1.0614028, "epoch": 0.012144897038929806, "flos": 23105751229440.0, "grad_norm": 2.088192664231089, "language_loss": 0.84052485, "learning_rate": 3.4177397475956223e-06, "loss": 0.86567622, "num_input_tokens_seen": 4256755, "step": 202, "time_per_iteration": 2.6981592178344727 }, { "auxiliary_loss_clip": 0.01388741, "auxiliary_loss_mlp": 0.0111019, "balance_loss_clip": 1.11006808, "balance_loss_mlp": 1.05771446, "epoch": 0.012205020291597776, "flos": 21033203460480.0, "grad_norm": 1.7861279575653157, "language_loss": 0.89964712, "learning_rate": 3.4209192710126685e-06, "loss": 0.92463642, "num_input_tokens_seen": 4276505, "step": 203, "time_per_iteration": 2.668757438659668 }, { "auxiliary_loss_clip": 0.01276289, "auxiliary_loss_mlp": 0.01095021, "balance_loss_clip": 1.12578154, "balance_loss_mlp": 1.07470798, "epoch": 0.012265143544265745, "flos": 68447785075200.0, "grad_norm": 1.0265297625980543, "language_loss": 0.61255801, "learning_rate": 3.4240831701729837e-06, "loss": 0.63627112, "num_input_tokens_seen": 4330965, "step": 204, "time_per_iteration": 3.161599636077881 }, { "auxiliary_loss_clip": 0.01396271, "auxiliary_loss_mlp": 0.01111806, "balance_loss_clip": 1.11291122, "balance_loss_mlp": 1.05930579, "epoch": 0.012325266796933715, "flos": 17019252478080.0, "grad_norm": 2.3248674300118184, "language_loss": 0.91324663, "learning_rate": 3.4272315978819516e-06, "loss": 0.93832743, "num_input_tokens_seen": 4348200, "step": 205, "time_per_iteration": 2.6764047145843506 }, { "auxiliary_loss_clip": 0.01404558, "auxiliary_loss_mlp": 0.0112167, "balance_loss_clip": 1.11773109, "balance_loss_mlp": 1.06773925, "epoch": 0.012385390049601683, "flos": 20190056538240.0, "grad_norm": 2.1088315130515207, "language_loss": 0.89305568, "learning_rate": 3.4303647047142043e-06, "loss": 0.91831797, "num_input_tokens_seen": 4365460, "step": 206, "time_per_iteration": 2.7157227993011475 }, { "auxiliary_loss_clip": 0.0139534, "auxiliary_loss_mlp": 0.01100957, "balance_loss_clip": 1.11176991, "balance_loss_mlp": 1.04888678, "epoch": 0.012445513302269652, "flos": 16253134272000.0, "grad_norm": 2.399816031687551, "language_loss": 0.95542914, "learning_rate": 3.43348263905683e-06, "loss": 0.9803921, "num_input_tokens_seen": 4383650, "step": 207, "time_per_iteration": 2.611348867416382 }, { "auxiliary_loss_clip": 0.01393005, "auxiliary_loss_mlp": 0.01117764, "balance_loss_clip": 1.11658561, "balance_loss_mlp": 1.06497812, "epoch": 0.012505636554937622, "flos": 23769380954880.0, "grad_norm": 1.8144323603981871, "language_loss": 0.75985783, "learning_rate": 3.436585547151547e-06, "loss": 0.78496552, "num_input_tokens_seen": 4403765, "step": 208, "time_per_iteration": 2.7184154987335205 }, { "auxiliary_loss_clip": 0.0138146, "auxiliary_loss_mlp": 0.01108623, "balance_loss_clip": 1.11071992, "balance_loss_mlp": 1.05576587, "epoch": 0.012565759807605591, "flos": 30591546157440.0, "grad_norm": 2.2326965650696855, "language_loss": 0.98386943, "learning_rate": 3.4396735731358586e-06, "loss": 1.00877023, "num_input_tokens_seen": 4421935, "step": 209, "time_per_iteration": 2.7354249954223633 }, { "auxiliary_loss_clip": 0.01387012, "auxiliary_loss_mlp": 0.0111836, "balance_loss_clip": 1.11136842, "balance_loss_mlp": 1.06490695, "epoch": 0.012625883060273561, "flos": 40113511355520.0, "grad_norm": 9.084733304650118, "language_loss": 0.85514843, "learning_rate": 3.4427468590832302e-06, "loss": 0.88020217, "num_input_tokens_seen": 4441470, "step": 210, "time_per_iteration": 2.888749122619629 }, { "auxiliary_loss_clip": 0.01384384, "auxiliary_loss_mlp": 0.01121559, "balance_loss_clip": 1.11018038, "balance_loss_mlp": 1.07115781, "epoch": 0.01268600631294153, "flos": 27089178629760.0, "grad_norm": 3.431917100192063, "language_loss": 0.97194636, "learning_rate": 3.445805545042314e-06, "loss": 0.99700582, "num_input_tokens_seen": 4459950, "step": 211, "time_per_iteration": 2.7465193271636963 }, { "auxiliary_loss_clip": 0.01393556, "auxiliary_loss_mlp": 0.01123542, "balance_loss_clip": 1.11511767, "balance_loss_mlp": 1.06999326, "epoch": 0.012746129565609499, "flos": 16982767238400.0, "grad_norm": 2.3992368053115163, "language_loss": 0.9508543, "learning_rate": 3.448849769075239e-06, "loss": 0.97602528, "num_input_tokens_seen": 4478390, "step": 212, "time_per_iteration": 2.6340651512145996 }, { "auxiliary_loss_clip": 0.01381697, "auxiliary_loss_mlp": 0.01116386, "balance_loss_clip": 1.112149, "balance_loss_mlp": 1.06381512, "epoch": 0.012806252818277469, "flos": 46533476995200.0, "grad_norm": 1.701444843398511, "language_loss": 0.76078421, "learning_rate": 3.4518796672950093e-06, "loss": 0.78576505, "num_input_tokens_seen": 4501665, "step": 213, "time_per_iteration": 2.9250640869140625 }, { "auxiliary_loss_clip": 0.01385821, "auxiliary_loss_mlp": 0.01111776, "balance_loss_clip": 1.11002433, "balance_loss_mlp": 1.06056333, "epoch": 0.012866376070945438, "flos": 14388616120320.0, "grad_norm": 3.5300370267625922, "language_loss": 0.86698866, "learning_rate": 3.4548953739020187e-06, "loss": 0.89196461, "num_input_tokens_seen": 4519055, "step": 214, "time_per_iteration": 2.645289659500122 }, { "auxiliary_loss_clip": 0.01383455, "auxiliary_loss_mlp": 0.01128262, "balance_loss_clip": 1.1159339, "balance_loss_mlp": 1.07359219, "epoch": 0.012926499323613408, "flos": 26140813793280.0, "grad_norm": 2.14433888305053, "language_loss": 0.77582061, "learning_rate": 3.4578970212197196e-06, "loss": 0.80093777, "num_input_tokens_seen": 4540870, "step": 215, "time_per_iteration": 2.7315175533294678 }, { "auxiliary_loss_clip": 0.01391951, "auxiliary_loss_mlp": 0.01115104, "balance_loss_clip": 1.11440635, "balance_loss_mlp": 1.0638206, "epoch": 0.012986622576281377, "flos": 30117202128000.0, "grad_norm": 2.2964706747038233, "language_loss": 0.90423942, "learning_rate": 3.460884739729461e-06, "loss": 0.92930996, "num_input_tokens_seen": 4560395, "step": 216, "time_per_iteration": 2.724698781967163 }, { "auxiliary_loss_clip": 0.01384729, "auxiliary_loss_mlp": 0.01113374, "balance_loss_clip": 1.10847259, "balance_loss_mlp": 1.06096959, "epoch": 0.013046745828949347, "flos": 13954025468160.0, "grad_norm": 3.60062834696173, "language_loss": 0.93473232, "learning_rate": 3.463858658104523e-06, "loss": 0.95971346, "num_input_tokens_seen": 4575785, "step": 217, "time_per_iteration": 5.762276649475098 }, { "auxiliary_loss_clip": 0.01377712, "auxiliary_loss_mlp": 0.0110874, "balance_loss_clip": 1.10726643, "balance_loss_mlp": 1.05433273, "epoch": 0.013106869081617315, "flos": 17347835116800.0, "grad_norm": 1.943339896357513, "language_loss": 0.93811166, "learning_rate": 3.4668189032433696e-06, "loss": 0.96297616, "num_input_tokens_seen": 4594985, "step": 218, "time_per_iteration": 5.832701206207275 }, { "auxiliary_loss_clip": 0.01372884, "auxiliary_loss_mlp": 0.01106717, "balance_loss_clip": 1.10647273, "balance_loss_mlp": 1.05552888, "epoch": 0.013166992334285284, "flos": 25884914325120.0, "grad_norm": 2.252873600345955, "language_loss": 0.86196327, "learning_rate": 3.46976560030214e-06, "loss": 0.88675928, "num_input_tokens_seen": 4616125, "step": 219, "time_per_iteration": 2.794581651687622 }, { "auxiliary_loss_clip": 0.0137885, "auxiliary_loss_mlp": 0.01102953, "balance_loss_clip": 1.10957599, "balance_loss_mlp": 1.05188394, "epoch": 0.013227115586953254, "flos": 31175956437120.0, "grad_norm": 1.897987121161891, "language_loss": 0.8748548, "learning_rate": 3.4726988727263976e-06, "loss": 0.89967287, "num_input_tokens_seen": 4637795, "step": 220, "time_per_iteration": 2.799927234649658 }, { "auxiliary_loss_clip": 0.01370688, "auxiliary_loss_mlp": 0.01115596, "balance_loss_clip": 1.10440111, "balance_loss_mlp": 1.0679127, "epoch": 0.013287238839621223, "flos": 20409470766720.0, "grad_norm": 3.2557072980071795, "language_loss": 0.86437249, "learning_rate": 3.475618842282164e-06, "loss": 0.88923532, "num_input_tokens_seen": 4656835, "step": 221, "time_per_iteration": 2.7040672302246094 }, { "auxiliary_loss_clip": 0.01376134, "auxiliary_loss_mlp": 0.01116397, "balance_loss_clip": 1.10384834, "balance_loss_mlp": 1.0637064, "epoch": 0.013347362092289193, "flos": 14137134024960.0, "grad_norm": 2.585706849100757, "language_loss": 0.92369294, "learning_rate": 3.4785256290862486e-06, "loss": 0.94861829, "num_input_tokens_seen": 4673015, "step": 222, "time_per_iteration": 2.6648194789886475 }, { "auxiliary_loss_clip": 0.01373283, "auxiliary_loss_mlp": 0.01106423, "balance_loss_clip": 1.10636806, "balance_loss_mlp": 1.05156267, "epoch": 0.013407485344957162, "flos": 21797705554560.0, "grad_norm": 7.739608779999776, "language_loss": 0.95708215, "learning_rate": 3.481419351635897e-06, "loss": 0.98187923, "num_input_tokens_seen": 4692355, "step": 223, "time_per_iteration": 2.7261807918548584 }, { "auxiliary_loss_clip": 0.01374555, "auxiliary_loss_mlp": 0.0110963, "balance_loss_clip": 1.10768425, "balance_loss_mlp": 1.05870414, "epoch": 0.013467608597625132, "flos": 18621622195200.0, "grad_norm": 2.673591615227502, "language_loss": 0.88031876, "learning_rate": 3.484300126837776e-06, "loss": 0.90516055, "num_input_tokens_seen": 4710080, "step": 224, "time_per_iteration": 2.601686477661133 }, { "auxiliary_loss_clip": 0.01374533, "auxiliary_loss_mlp": 0.01103, "balance_loss_clip": 1.10679817, "balance_loss_mlp": 1.04804444, "epoch": 0.013527731850293101, "flos": 18552314903040.0, "grad_norm": 3.0722216996453535, "language_loss": 0.89625597, "learning_rate": 3.487168070036317e-06, "loss": 0.9210313, "num_input_tokens_seen": 4728980, "step": 225, "time_per_iteration": 2.6677513122558594 }, { "auxiliary_loss_clip": 0.01369955, "auxiliary_loss_mlp": 0.0112021, "balance_loss_clip": 1.10561275, "balance_loss_mlp": 1.06675696, "epoch": 0.01358785510296107, "flos": 19165381257600.0, "grad_norm": 1.9576206039109396, "language_loss": 0.98980033, "learning_rate": 3.4900232950414224e-06, "loss": 1.01470196, "num_input_tokens_seen": 4747020, "step": 226, "time_per_iteration": 2.8320930004119873 }, { "auxiliary_loss_clip": 0.01375268, "auxiliary_loss_mlp": 0.01110039, "balance_loss_clip": 1.10837173, "balance_loss_mlp": 1.05572701, "epoch": 0.01364797835562904, "flos": 23329941966720.0, "grad_norm": 2.3303410550109245, "language_loss": 0.90965348, "learning_rate": 3.4928659141555727e-06, "loss": 0.93450654, "num_input_tokens_seen": 4765000, "step": 227, "time_per_iteration": 2.648606061935425 }, { "auxiliary_loss_clip": 0.01255161, "auxiliary_loss_mlp": 0.01079249, "balance_loss_clip": 1.11229861, "balance_loss_mlp": 1.06017554, "epoch": 0.013708101608297009, "flos": 70993746097920.0, "grad_norm": 0.9472069433514878, "language_loss": 0.57650995, "learning_rate": 3.4956960382003234e-06, "loss": 0.59985405, "num_input_tokens_seen": 4833210, "step": 228, "time_per_iteration": 3.246328592300415 }, { "auxiliary_loss_clip": 0.01366835, "auxiliary_loss_mlp": 0.01117377, "balance_loss_clip": 1.10507822, "balance_loss_mlp": 1.06711841, "epoch": 0.013768224860964979, "flos": 16325170997760.0, "grad_norm": 2.957038430634678, "language_loss": 0.87773621, "learning_rate": 3.4985137765422354e-06, "loss": 0.90257835, "num_input_tokens_seen": 4850120, "step": 229, "time_per_iteration": 2.6319024562835693 }, { "auxiliary_loss_clip": 0.01375278, "auxiliary_loss_mlp": 0.01098609, "balance_loss_clip": 1.10567176, "balance_loss_mlp": 1.04873204, "epoch": 0.013828348113632948, "flos": 20193037367040.0, "grad_norm": 4.72663824849547, "language_loss": 0.83937395, "learning_rate": 3.501319237118231e-06, "loss": 0.86411285, "num_input_tokens_seen": 4866215, "step": 230, "time_per_iteration": 2.7026398181915283 }, { "auxiliary_loss_clip": 0.01373544, "auxiliary_loss_mlp": 0.01113683, "balance_loss_clip": 1.10701275, "balance_loss_mlp": 1.06361556, "epoch": 0.013888471366300916, "flos": 20741070147840.0, "grad_norm": 2.2562202151287867, "language_loss": 0.904212, "learning_rate": 3.5041125264604056e-06, "loss": 0.9290843, "num_input_tokens_seen": 4885630, "step": 231, "time_per_iteration": 2.6424474716186523 }, { "auxiliary_loss_clip": 0.01377759, "auxiliary_loss_mlp": 0.01110232, "balance_loss_clip": 1.11118639, "balance_loss_mlp": 1.06030726, "epoch": 0.013948594618968886, "flos": 22090628966400.0, "grad_norm": 2.0229562700819215, "language_loss": 0.83624899, "learning_rate": 3.5068937497203002e-06, "loss": 0.86112887, "num_input_tokens_seen": 4905570, "step": 232, "time_per_iteration": 2.621704339981079 }, { "auxiliary_loss_clip": 0.01377798, "auxiliary_loss_mlp": 0.01094369, "balance_loss_clip": 1.10229027, "balance_loss_mlp": 1.04253721, "epoch": 0.014008717871636855, "flos": 19063108258560.0, "grad_norm": 5.516695444379509, "language_loss": 0.74727643, "learning_rate": 3.509663010692652e-06, "loss": 0.77199805, "num_input_tokens_seen": 4923535, "step": 233, "time_per_iteration": 2.659188747406006 }, { "auxiliary_loss_clip": 0.01382744, "auxiliary_loss_mlp": 0.01125121, "balance_loss_clip": 1.1099937, "balance_loss_mlp": 1.0723356, "epoch": 0.014068841124304825, "flos": 14530822064640.0, "grad_norm": 2.5763093382937483, "language_loss": 0.85633421, "learning_rate": 3.512420411838642e-06, "loss": 0.88141286, "num_input_tokens_seen": 4939200, "step": 234, "time_per_iteration": 2.610635757446289 }, { "auxiliary_loss_clip": 0.01374562, "auxiliary_loss_mlp": 0.01114672, "balance_loss_clip": 1.10890436, "balance_loss_mlp": 1.06467605, "epoch": 0.014128964376972794, "flos": 18077396256000.0, "grad_norm": 2.467487286445388, "language_loss": 0.89192498, "learning_rate": 3.515166054308634e-06, "loss": 0.91681731, "num_input_tokens_seen": 4956620, "step": 235, "time_per_iteration": 2.668769359588623 }, { "auxiliary_loss_clip": 0.01373018, "auxiliary_loss_mlp": 0.01131641, "balance_loss_clip": 1.11011076, "balance_loss_mlp": 1.08073914, "epoch": 0.014189087629640764, "flos": 25334331678720.0, "grad_norm": 2.143165146200321, "language_loss": 0.85535377, "learning_rate": 3.5179000379644498e-06, "loss": 0.88040036, "num_input_tokens_seen": 4975650, "step": 236, "time_per_iteration": 2.7570323944091797 }, { "auxiliary_loss_clip": 0.01369632, "auxiliary_loss_mlp": 0.01100269, "balance_loss_clip": 1.10296702, "balance_loss_mlp": 1.04905629, "epoch": 0.014249210882308733, "flos": 36139744713600.0, "grad_norm": 2.1351980688483136, "language_loss": 0.82550979, "learning_rate": 3.520622461401154e-06, "loss": 0.85020876, "num_input_tokens_seen": 4997415, "step": 237, "time_per_iteration": 2.811617374420166 }, { "auxiliary_loss_clip": 0.01369728, "auxiliary_loss_mlp": 0.01124352, "balance_loss_clip": 1.10659075, "balance_loss_mlp": 1.07085085, "epoch": 0.014309334134976702, "flos": 12932977461120.0, "grad_norm": 2.0241581748099313, "language_loss": 0.77096599, "learning_rate": 3.5233334219683935e-06, "loss": 0.79590684, "num_input_tokens_seen": 5013905, "step": 238, "time_per_iteration": 2.8044662475585938 }, { "auxiliary_loss_clip": 0.01367496, "auxiliary_loss_mlp": 0.01111406, "balance_loss_clip": 1.10897434, "balance_loss_mlp": 1.06343579, "epoch": 0.014369457387644672, "flos": 20777519473920.0, "grad_norm": 1.8300428555870456, "language_loss": 0.8707583, "learning_rate": 3.526033015791284e-06, "loss": 0.89554727, "num_input_tokens_seen": 5033645, "step": 239, "time_per_iteration": 2.681452751159668 }, { "auxiliary_loss_clip": 0.01353036, "auxiliary_loss_mlp": 0.01103184, "balance_loss_clip": 1.10036874, "balance_loss_mlp": 1.05516672, "epoch": 0.01442958064031264, "flos": 25848536826240.0, "grad_norm": 2.109315431148974, "language_loss": 0.93055749, "learning_rate": 3.528721337790862e-06, "loss": 0.95511973, "num_input_tokens_seen": 5052875, "step": 240, "time_per_iteration": 2.679826021194458 }, { "auxiliary_loss_clip": 0.01360794, "auxiliary_loss_mlp": 0.01103084, "balance_loss_clip": 1.10475957, "balance_loss_mlp": 1.05611515, "epoch": 0.014489703892980611, "flos": 28219718269440.0, "grad_norm": 3.7136133710916575, "language_loss": 0.8482846, "learning_rate": 3.531398481704111e-06, "loss": 0.87292337, "num_input_tokens_seen": 5075005, "step": 241, "time_per_iteration": 2.679126262664795 }, { "auxiliary_loss_clip": 0.01359518, "auxiliary_loss_mlp": 0.01119602, "balance_loss_clip": 1.11010456, "balance_loss_mlp": 1.06931913, "epoch": 0.01454982714564858, "flos": 22490925108480.0, "grad_norm": 1.8502491938168453, "language_loss": 0.88590866, "learning_rate": 3.534064540103573e-06, "loss": 0.9106999, "num_input_tokens_seen": 5091875, "step": 242, "time_per_iteration": 2.7366583347320557 }, { "auxiliary_loss_clip": 0.01359534, "auxiliary_loss_mlp": 0.01104713, "balance_loss_clip": 1.10356677, "balance_loss_mlp": 1.05342889, "epoch": 0.014609950398316548, "flos": 21653201139840.0, "grad_norm": 2.261458758817042, "language_loss": 0.86688942, "learning_rate": 3.536719604416555e-06, "loss": 0.89153194, "num_input_tokens_seen": 5111290, "step": 243, "time_per_iteration": 2.764378070831299 }, { "auxiliary_loss_clip": 0.01364897, "auxiliary_loss_mlp": 0.01106776, "balance_loss_clip": 1.10636568, "balance_loss_mlp": 1.05656552, "epoch": 0.014670073650984519, "flos": 21869993675520.0, "grad_norm": 1.6964959858678799, "language_loss": 0.84256208, "learning_rate": 3.5393637649439464e-06, "loss": 0.86727887, "num_input_tokens_seen": 5132265, "step": 244, "time_per_iteration": 2.630441188812256 }, { "auxiliary_loss_clip": 0.01372266, "auxiliary_loss_mlp": 0.01115072, "balance_loss_clip": 1.10771632, "balance_loss_mlp": 1.06328762, "epoch": 0.014730196903652487, "flos": 23183713699200.0, "grad_norm": 8.49550264430495, "language_loss": 0.78613877, "learning_rate": 3.54199711087864e-06, "loss": 0.81101215, "num_input_tokens_seen": 5148575, "step": 245, "time_per_iteration": 2.6991443634033203 }, { "auxiliary_loss_clip": 0.01371598, "auxiliary_loss_mlp": 0.0110404, "balance_loss_clip": 1.10405719, "balance_loss_mlp": 1.05008554, "epoch": 0.014790320156320457, "flos": 23222605150080.0, "grad_norm": 2.2582939339926305, "language_loss": 0.84165329, "learning_rate": 3.5446197303235913e-06, "loss": 0.86640966, "num_input_tokens_seen": 5170415, "step": 246, "time_per_iteration": 2.726743221282959 }, { "auxiliary_loss_clip": 0.01365538, "auxiliary_loss_mlp": 0.01101456, "balance_loss_clip": 1.10242295, "balance_loss_mlp": 1.05062532, "epoch": 0.014850443408988426, "flos": 15815490963840.0, "grad_norm": 1.9870849133800452, "language_loss": 0.89958012, "learning_rate": 3.5472317103095034e-06, "loss": 0.92425001, "num_input_tokens_seen": 5188565, "step": 247, "time_per_iteration": 2.5998406410217285 }, { "auxiliary_loss_clip": 0.01364581, "auxiliary_loss_mlp": 0.01098108, "balance_loss_clip": 1.09896278, "balance_loss_mlp": 1.0489223, "epoch": 0.014910566661656396, "flos": 22781657790720.0, "grad_norm": 2.0527635487774343, "language_loss": 0.783005, "learning_rate": 3.549833136812155e-06, "loss": 0.80763197, "num_input_tokens_seen": 5207810, "step": 248, "time_per_iteration": 2.689784049987793 }, { "auxiliary_loss_clip": 0.01365896, "auxiliary_loss_mlp": 0.01110511, "balance_loss_clip": 1.10732806, "balance_loss_mlp": 1.06044269, "epoch": 0.014970689914324365, "flos": 26865023806080.0, "grad_norm": 1.9405946352322343, "language_loss": 0.83855766, "learning_rate": 3.552424094769381e-06, "loss": 0.86332172, "num_input_tokens_seen": 5226210, "step": 249, "time_per_iteration": 2.8210339546203613 }, { "auxiliary_loss_clip": 0.01358179, "auxiliary_loss_mlp": 0.01106801, "balance_loss_clip": 1.10089588, "balance_loss_mlp": 1.05802023, "epoch": 0.015030813166992334, "flos": 13985662371840.0, "grad_norm": 2.0689026358419786, "language_loss": 0.93631709, "learning_rate": 3.5550046680977174e-06, "loss": 0.96096689, "num_input_tokens_seen": 5241660, "step": 250, "time_per_iteration": 2.7074570655822754 }, { "auxiliary_loss_clip": 0.01368183, "auxiliary_loss_mlp": 0.01115393, "balance_loss_clip": 1.1065619, "balance_loss_mlp": 1.06415713, "epoch": 0.015090936419660304, "flos": 24717817618560.0, "grad_norm": 2.6509740932573127, "language_loss": 0.9678722, "learning_rate": 3.5575749397087034e-06, "loss": 0.99270797, "num_input_tokens_seen": 5261090, "step": 251, "time_per_iteration": 2.6740176677703857 }, { "auxiliary_loss_clip": 0.01361249, "auxiliary_loss_mlp": 0.01108489, "balance_loss_clip": 1.10063529, "balance_loss_mlp": 1.0597558, "epoch": 0.015151059672328273, "flos": 25738793798400.0, "grad_norm": 1.996044018630987, "language_loss": 0.84516245, "learning_rate": 3.5601349915248707e-06, "loss": 0.86985981, "num_input_tokens_seen": 5279175, "step": 252, "time_per_iteration": 2.7198123931884766 }, { "auxiliary_loss_clip": 0.01356789, "auxiliary_loss_mlp": 0.0111346, "balance_loss_clip": 1.1023767, "balance_loss_mlp": 1.06346345, "epoch": 0.015211182924996243, "flos": 21871214737920.0, "grad_norm": 2.3132428526475275, "language_loss": 0.98516917, "learning_rate": 3.5626849044954064e-06, "loss": 1.0098716, "num_input_tokens_seen": 5296975, "step": 253, "time_per_iteration": 2.6751561164855957 }, { "auxiliary_loss_clip": 0.01244193, "auxiliary_loss_mlp": 0.01100072, "balance_loss_clip": 1.1058414, "balance_loss_mlp": 1.08338308, "epoch": 0.015271306177664212, "flos": 66895080888960.0, "grad_norm": 0.8719135194962525, "language_loss": 0.55628473, "learning_rate": 3.5652247586115167e-06, "loss": 0.57972741, "num_input_tokens_seen": 5358375, "step": 254, "time_per_iteration": 3.2305996417999268 }, { "auxiliary_loss_clip": 0.0136146, "auxiliary_loss_mlp": 0.01119692, "balance_loss_clip": 1.0985806, "balance_loss_mlp": 1.06952846, "epoch": 0.01533142943033218, "flos": 26834069260800.0, "grad_norm": 2.113472843461701, "language_loss": 0.90234184, "learning_rate": 3.567754632921479e-06, "loss": 0.92715329, "num_input_tokens_seen": 5377255, "step": 255, "time_per_iteration": 2.7138473987579346 }, { "auxiliary_loss_clip": 0.01357311, "auxiliary_loss_mlp": 0.01137867, "balance_loss_clip": 1.1001389, "balance_loss_mlp": 1.08803785, "epoch": 0.01539155268300015, "flos": 20813753318400.0, "grad_norm": 2.320838285045027, "language_loss": 0.85392761, "learning_rate": 3.5702746055454075e-06, "loss": 0.87887937, "num_input_tokens_seen": 5395320, "step": 256, "time_per_iteration": 2.7135775089263916 }, { "auxiliary_loss_clip": 0.01363873, "auxiliary_loss_mlp": 0.0112257, "balance_loss_clip": 1.10053098, "balance_loss_mlp": 1.07281172, "epoch": 0.01545167593566812, "flos": 15961862885760.0, "grad_norm": 4.480294478847577, "language_loss": 0.71472508, "learning_rate": 3.5727847536897254e-06, "loss": 0.73958945, "num_input_tokens_seen": 5411970, "step": 257, "time_per_iteration": 6.340675592422485 }, { "auxiliary_loss_clip": 0.01355912, "auxiliary_loss_mlp": 0.01112611, "balance_loss_clip": 1.10014856, "balance_loss_mlp": 1.06280565, "epoch": 0.01551179918833609, "flos": 22601745544320.0, "grad_norm": 2.0292888191897673, "language_loss": 0.94713151, "learning_rate": 3.5752851536613596e-06, "loss": 0.97181678, "num_input_tokens_seen": 5430245, "step": 258, "time_per_iteration": 5.674164772033691 }, { "auxiliary_loss_clip": 0.01356656, "auxiliary_loss_mlp": 0.01113313, "balance_loss_clip": 1.09867072, "balance_loss_mlp": 1.0645566, "epoch": 0.015571922441004058, "flos": 22816706486400.0, "grad_norm": 2.3215886633849236, "language_loss": 0.93037683, "learning_rate": 3.577775880881658e-06, "loss": 0.95507646, "num_input_tokens_seen": 5448905, "step": 259, "time_per_iteration": 2.6286497116088867 }, { "auxiliary_loss_clip": 0.01348977, "auxiliary_loss_mlp": 0.01102171, "balance_loss_clip": 1.10076857, "balance_loss_mlp": 1.05625176, "epoch": 0.015632045693672027, "flos": 18947439486720.0, "grad_norm": 1.9575053933526474, "language_loss": 0.97368109, "learning_rate": 3.5802570099000424e-06, "loss": 0.99819261, "num_input_tokens_seen": 5466405, "step": 260, "time_per_iteration": 2.625072717666626 }, { "auxiliary_loss_clip": 0.01362999, "auxiliary_loss_mlp": 0.01127943, "balance_loss_clip": 1.1010474, "balance_loss_mlp": 1.07940137, "epoch": 0.015692168946339995, "flos": 29971728046080.0, "grad_norm": 2.2828802632863305, "language_loss": 0.87807435, "learning_rate": 3.5827286144073947e-06, "loss": 0.90298378, "num_input_tokens_seen": 5487055, "step": 261, "time_per_iteration": 2.6737279891967773 }, { "auxiliary_loss_clip": 0.01357008, "auxiliary_loss_mlp": 0.01125312, "balance_loss_clip": 1.09822345, "balance_loss_mlp": 1.07665133, "epoch": 0.015752292199007967, "flos": 19392085946880.0, "grad_norm": 5.057676675675106, "language_loss": 0.67100549, "learning_rate": 3.5851907672491904e-06, "loss": 0.69582868, "num_input_tokens_seen": 5506600, "step": 262, "time_per_iteration": 2.651690721511841 }, { "auxiliary_loss_clip": 0.01353953, "auxiliary_loss_mlp": 0.01135541, "balance_loss_clip": 1.09924924, "balance_loss_mlp": 1.08499634, "epoch": 0.015812415451675936, "flos": 20339804338560.0, "grad_norm": 3.0820356667611337, "language_loss": 0.68077701, "learning_rate": 3.587643540438383e-06, "loss": 0.70567191, "num_input_tokens_seen": 5524350, "step": 263, "time_per_iteration": 2.6885130405426025 }, { "auxiliary_loss_clip": 0.01355592, "auxiliary_loss_mlp": 0.01116799, "balance_loss_clip": 1.09620881, "balance_loss_mlp": 1.06766081, "epoch": 0.015872538704343905, "flos": 17525412979200.0, "grad_norm": 3.9089218881424674, "language_loss": 0.85002583, "learning_rate": 3.590087005168037e-06, "loss": 0.87474978, "num_input_tokens_seen": 5542145, "step": 264, "time_per_iteration": 2.6557912826538086 }, { "auxiliary_loss_clip": 0.01360388, "auxiliary_loss_mlp": 0.01102763, "balance_loss_clip": 1.10088885, "balance_loss_mlp": 1.056319, "epoch": 0.015932661957011873, "flos": 15260490944640.0, "grad_norm": 2.7020928553211476, "language_loss": 1.04234743, "learning_rate": 3.5925212318237344e-06, "loss": 1.06697881, "num_input_tokens_seen": 5557920, "step": 265, "time_per_iteration": 2.6262216567993164 }, { "auxiliary_loss_clip": 0.01364512, "auxiliary_loss_mlp": 0.01120309, "balance_loss_clip": 1.1033864, "balance_loss_mlp": 1.06835794, "epoch": 0.015992785209679845, "flos": 20302528999680.0, "grad_norm": 3.1220748516520134, "language_loss": 0.74914098, "learning_rate": 3.5949462899957323e-06, "loss": 0.7739892, "num_input_tokens_seen": 5576290, "step": 266, "time_per_iteration": 2.6244583129882812 }, { "auxiliary_loss_clip": 0.01349738, "auxiliary_loss_mlp": 0.0111189, "balance_loss_clip": 1.1000762, "balance_loss_mlp": 1.06206095, "epoch": 0.016052908462347814, "flos": 23362368969600.0, "grad_norm": 1.8166776194063956, "language_loss": 0.90909529, "learning_rate": 3.5973622484909068e-06, "loss": 0.93371153, "num_input_tokens_seen": 5595205, "step": 267, "time_per_iteration": 2.6753580570220947 }, { "auxiliary_loss_clip": 0.01359091, "auxiliary_loss_mlp": 0.01115968, "balance_loss_clip": 1.10122573, "balance_loss_mlp": 1.06797481, "epoch": 0.016113031715015783, "flos": 21286588976640.0, "grad_norm": 2.450608875877181, "language_loss": 0.85636413, "learning_rate": 3.599769175344462e-06, "loss": 0.88111478, "num_input_tokens_seen": 5612645, "step": 268, "time_per_iteration": 2.7161567211151123 }, { "auxiliary_loss_clip": 0.01351132, "auxiliary_loss_mlp": 0.01102276, "balance_loss_clip": 1.10226274, "balance_loss_mlp": 1.05475891, "epoch": 0.01617315496768375, "flos": 18914689261440.0, "grad_norm": 2.1714201716772457, "language_loss": 0.88080788, "learning_rate": 3.602167137831432e-06, "loss": 0.90534198, "num_input_tokens_seen": 5628345, "step": 269, "time_per_iteration": 2.6403756141662598 }, { "auxiliary_loss_clip": 0.01357907, "auxiliary_loss_mlp": 0.01111574, "balance_loss_clip": 1.10001528, "balance_loss_mlp": 1.06021833, "epoch": 0.01623327822035172, "flos": 16546488647040.0, "grad_norm": 2.5848702107942803, "language_loss": 0.97077739, "learning_rate": 3.6045562024779565e-06, "loss": 0.99547219, "num_input_tokens_seen": 5645940, "step": 270, "time_per_iteration": 2.635546922683716 }, { "auxiliary_loss_clip": 0.01356007, "auxiliary_loss_mlp": 0.01118132, "balance_loss_clip": 1.10402, "balance_loss_mlp": 1.06918478, "epoch": 0.016293401473019692, "flos": 23513481486720.0, "grad_norm": 2.1115750591463223, "language_loss": 0.86112005, "learning_rate": 3.606936435072361e-06, "loss": 0.8858614, "num_input_tokens_seen": 5665690, "step": 271, "time_per_iteration": 2.6877286434173584 }, { "auxiliary_loss_clip": 0.013537, "auxiliary_loss_mlp": 0.01105687, "balance_loss_clip": 1.0962286, "balance_loss_mlp": 1.057693, "epoch": 0.01635352472568766, "flos": 29016072748800.0, "grad_norm": 2.5391912683658413, "language_loss": 0.81550127, "learning_rate": 3.609307900676025e-06, "loss": 0.84009504, "num_input_tokens_seen": 5683190, "step": 272, "time_per_iteration": 2.6728365421295166 }, { "auxiliary_loss_clip": 0.01348527, "auxiliary_loss_mlp": 0.01120864, "balance_loss_clip": 1.09806561, "balance_loss_mlp": 1.07368064, "epoch": 0.01641364797835563, "flos": 13370513028480.0, "grad_norm": 2.3613573538590487, "language_loss": 0.81075382, "learning_rate": 3.611670663634051e-06, "loss": 0.83544779, "num_input_tokens_seen": 5699780, "step": 273, "time_per_iteration": 2.595008134841919 }, { "auxiliary_loss_clip": 0.01346135, "auxiliary_loss_mlp": 0.01105539, "balance_loss_clip": 1.09398317, "balance_loss_mlp": 1.05749762, "epoch": 0.016473771231023598, "flos": 18878239935360.0, "grad_norm": 2.1979313648400547, "language_loss": 0.9131726, "learning_rate": 3.614024787585744e-06, "loss": 0.9376893, "num_input_tokens_seen": 5716980, "step": 274, "time_per_iteration": 2.684718132019043 }, { "auxiliary_loss_clip": 0.013432, "auxiliary_loss_mlp": 0.01108715, "balance_loss_clip": 1.09515727, "balance_loss_mlp": 1.06062579, "epoch": 0.016533894483691566, "flos": 22601637803520.0, "grad_norm": 1.9719932168994616, "language_loss": 0.88054645, "learning_rate": 3.6163703354748927e-06, "loss": 0.90506566, "num_input_tokens_seen": 5737780, "step": 275, "time_per_iteration": 2.7204532623291016 }, { "auxiliary_loss_clip": 0.01346726, "auxiliary_loss_mlp": 0.01102856, "balance_loss_clip": 1.09623361, "balance_loss_mlp": 1.05312169, "epoch": 0.01659401773635954, "flos": 21507188353920.0, "grad_norm": 1.7930545784536995, "language_loss": 0.80726624, "learning_rate": 3.6187073695598707e-06, "loss": 0.83176208, "num_input_tokens_seen": 5758330, "step": 276, "time_per_iteration": 3.04716157913208 }, { "auxiliary_loss_clip": 0.0133817, "auxiliary_loss_mlp": 0.01096103, "balance_loss_clip": 1.09588337, "balance_loss_mlp": 1.05220985, "epoch": 0.016654140989027507, "flos": 32850973411200.0, "grad_norm": 1.9196343116615175, "language_loss": 0.80707026, "learning_rate": 3.621035951423551e-06, "loss": 0.83141291, "num_input_tokens_seen": 5778340, "step": 277, "time_per_iteration": 2.809645652770996 }, { "auxiliary_loss_clip": 0.01337061, "auxiliary_loss_mlp": 0.0109637, "balance_loss_clip": 1.08979487, "balance_loss_mlp": 1.04923487, "epoch": 0.016714264241695476, "flos": 12306228024960.0, "grad_norm": 2.3224792061881185, "language_loss": 0.80508065, "learning_rate": 3.623356141983041e-06, "loss": 0.82941496, "num_input_tokens_seen": 5794295, "step": 278, "time_per_iteration": 2.604830741882324 }, { "auxiliary_loss_clip": 0.01341116, "auxiliary_loss_mlp": 0.01101968, "balance_loss_clip": 1.09395671, "balance_loss_mlp": 1.05585837, "epoch": 0.016774387494363444, "flos": 27123796362240.0, "grad_norm": 2.0021377353660057, "language_loss": 0.90582991, "learning_rate": 3.6256680014992486e-06, "loss": 0.93026078, "num_input_tokens_seen": 5814405, "step": 279, "time_per_iteration": 2.7193243503570557 }, { "auxiliary_loss_clip": 0.01346095, "auxiliary_loss_mlp": 0.01112065, "balance_loss_clip": 1.09383631, "balance_loss_mlp": 1.06450009, "epoch": 0.016834510747031413, "flos": 20191493082240.0, "grad_norm": 2.9314445951013988, "language_loss": 0.94049025, "learning_rate": 3.6279715895862713e-06, "loss": 0.96507192, "num_input_tokens_seen": 5832795, "step": 280, "time_per_iteration": 2.680924654006958 }, { "auxiliary_loss_clip": 0.01346658, "auxiliary_loss_mlp": 0.01109166, "balance_loss_clip": 1.09285879, "balance_loss_mlp": 1.06060064, "epoch": 0.016894633999699385, "flos": 27274262434560.0, "grad_norm": 2.6758913403282483, "language_loss": 0.74425459, "learning_rate": 3.6302669652206183e-06, "loss": 0.76881289, "num_input_tokens_seen": 5855750, "step": 281, "time_per_iteration": 2.691152811050415 }, { "auxiliary_loss_clip": 0.01343371, "auxiliary_loss_mlp": 0.01117708, "balance_loss_clip": 1.09609079, "balance_loss_mlp": 1.0724318, "epoch": 0.016954757252367354, "flos": 14902964922240.0, "grad_norm": 3.4878028680462005, "language_loss": 0.80255079, "learning_rate": 3.632554186750274e-06, "loss": 0.82716167, "num_input_tokens_seen": 5872610, "step": 282, "time_per_iteration": 2.592664957046509 }, { "auxiliary_loss_clip": 0.01348082, "auxiliary_loss_mlp": 0.01118449, "balance_loss_clip": 1.09700727, "balance_loss_mlp": 1.07114697, "epoch": 0.017014880505035322, "flos": 21358805270400.0, "grad_norm": 2.296781711700251, "language_loss": 0.77719986, "learning_rate": 3.6348333119035937e-06, "loss": 0.80186516, "num_input_tokens_seen": 5892985, "step": 283, "time_per_iteration": 2.6502227783203125 }, { "auxiliary_loss_clip": 0.01347311, "auxiliary_loss_mlp": 0.01092934, "balance_loss_clip": 1.0977478, "balance_loss_mlp": 1.04804015, "epoch": 0.01707500375770329, "flos": 35333154858240.0, "grad_norm": 2.3467060832193414, "language_loss": 0.84246969, "learning_rate": 3.6371043977980503e-06, "loss": 0.86687213, "num_input_tokens_seen": 5914060, "step": 284, "time_per_iteration": 2.8534958362579346 }, { "auxiliary_loss_clip": 0.01337962, "auxiliary_loss_mlp": 0.01100399, "balance_loss_clip": 1.09212708, "balance_loss_mlp": 1.05297756, "epoch": 0.01713512701037126, "flos": 23582070506880.0, "grad_norm": 2.7335752956200388, "language_loss": 0.96998906, "learning_rate": 3.639367500948819e-06, "loss": 0.99437273, "num_input_tokens_seen": 5932860, "step": 285, "time_per_iteration": 2.6338655948638916 }, { "auxiliary_loss_clip": 0.01341319, "auxiliary_loss_mlp": 0.01095606, "balance_loss_clip": 1.09538078, "balance_loss_mlp": 1.05123687, "epoch": 0.01719525026303923, "flos": 27634661544960.0, "grad_norm": 2.294843469150046, "language_loss": 0.94079655, "learning_rate": 3.6416226772772178e-06, "loss": 0.96516573, "num_input_tokens_seen": 5952725, "step": 286, "time_per_iteration": 2.711087942123413 }, { "auxiliary_loss_clip": 0.01332862, "auxiliary_loss_mlp": 0.0109035, "balance_loss_clip": 1.08986938, "balance_loss_mlp": 1.04409683, "epoch": 0.0172553735157072, "flos": 26979722910720.0, "grad_norm": 1.9277896882465477, "language_loss": 0.92464817, "learning_rate": 3.643869982119001e-06, "loss": 0.94888031, "num_input_tokens_seen": 5970560, "step": 287, "time_per_iteration": 2.640267848968506 }, { "auxiliary_loss_clip": 0.01338192, "auxiliary_loss_mlp": 0.01092315, "balance_loss_clip": 1.09039164, "balance_loss_mlp": 1.04651475, "epoch": 0.01731549676837517, "flos": 14056621689600.0, "grad_norm": 2.7883535936791035, "language_loss": 1.01873291, "learning_rate": 3.646109470232502e-06, "loss": 1.04303789, "num_input_tokens_seen": 5982980, "step": 288, "time_per_iteration": 2.558312177658081 }, { "auxiliary_loss_clip": 0.01225082, "auxiliary_loss_mlp": 0.01188305, "balance_loss_clip": 1.09194219, "balance_loss_mlp": 1.17228377, "epoch": 0.017375620021043137, "flos": 66510694471680.0, "grad_norm": 0.9289960013542303, "language_loss": 0.63867617, "learning_rate": 3.6483411958066417e-06, "loss": 0.66281009, "num_input_tokens_seen": 6049445, "step": 289, "time_per_iteration": 3.386254072189331 }, { "auxiliary_loss_clip": 0.01341215, "auxiliary_loss_mlp": 0.01107788, "balance_loss_clip": 1.09622383, "balance_loss_mlp": 1.06482446, "epoch": 0.01743574327371111, "flos": 15225154940160.0, "grad_norm": 2.368974734045724, "language_loss": 0.88156199, "learning_rate": 3.6505652124687957e-06, "loss": 0.90605205, "num_input_tokens_seen": 6064150, "step": 290, "time_per_iteration": 2.5670948028564453 }, { "auxiliary_loss_clip": 0.0133848, "auxiliary_loss_mlp": 0.010946, "balance_loss_clip": 1.09388971, "balance_loss_mlp": 1.04965782, "epoch": 0.017495866526379078, "flos": 25373869574400.0, "grad_norm": 2.2011772664145504, "language_loss": 0.84472585, "learning_rate": 3.6527815732925258e-06, "loss": 0.8690567, "num_input_tokens_seen": 6083920, "step": 291, "time_per_iteration": 2.648452043533325 }, { "auxiliary_loss_clip": 0.01343563, "auxiliary_loss_mlp": 0.01115116, "balance_loss_clip": 1.10129941, "balance_loss_mlp": 1.06607366, "epoch": 0.017555989779047047, "flos": 26359473836160.0, "grad_norm": 1.7675259544479762, "language_loss": 0.72679955, "learning_rate": 3.6549903308051806e-06, "loss": 0.75138628, "num_input_tokens_seen": 6105460, "step": 292, "time_per_iteration": 2.7239537239074707 }, { "auxiliary_loss_clip": 0.01334066, "auxiliary_loss_mlp": 0.01107289, "balance_loss_clip": 1.09397244, "balance_loss_mlp": 1.06170392, "epoch": 0.017616113031715015, "flos": 22338807010560.0, "grad_norm": 2.419616990787406, "language_loss": 0.86866581, "learning_rate": 3.6571915369953646e-06, "loss": 0.89307928, "num_input_tokens_seen": 6122890, "step": 293, "time_per_iteration": 2.642854690551758 }, { "auxiliary_loss_clip": 0.01333726, "auxiliary_loss_mlp": 0.0110557, "balance_loss_clip": 1.09271646, "balance_loss_mlp": 1.06086659, "epoch": 0.017676236284382984, "flos": 20156911263360.0, "grad_norm": 2.112624444766753, "language_loss": 0.80896151, "learning_rate": 3.6593852433202797e-06, "loss": 0.83335447, "num_input_tokens_seen": 6142890, "step": 294, "time_per_iteration": 2.598176956176758 }, { "auxiliary_loss_clip": 0.01334179, "auxiliary_loss_mlp": 0.01113433, "balance_loss_clip": 1.09030747, "balance_loss_mlp": 1.06892014, "epoch": 0.017736359537050956, "flos": 25223331674880.0, "grad_norm": 2.8289841764142416, "language_loss": 0.83806521, "learning_rate": 3.6615715007129453e-06, "loss": 0.86254132, "num_input_tokens_seen": 6162030, "step": 295, "time_per_iteration": 2.750103712081909 }, { "auxiliary_loss_clip": 0.01339845, "auxiliary_loss_mlp": 0.01121984, "balance_loss_clip": 1.09978509, "balance_loss_mlp": 1.0772326, "epoch": 0.017796482789718925, "flos": 20338798757760.0, "grad_norm": 1.8804378237246864, "language_loss": 0.84576106, "learning_rate": 3.6637503595892897e-06, "loss": 0.87037927, "num_input_tokens_seen": 6180540, "step": 296, "time_per_iteration": 4.154251337051392 }, { "auxiliary_loss_clip": 0.01337678, "auxiliary_loss_mlp": 0.01105295, "balance_loss_clip": 1.09463406, "balance_loss_mlp": 1.06154561, "epoch": 0.017856606042386893, "flos": 22379206832640.0, "grad_norm": 2.055710812588959, "language_loss": 0.87810111, "learning_rate": 3.665921869855132e-06, "loss": 0.90253091, "num_input_tokens_seen": 6199425, "step": 297, "time_per_iteration": 4.379676103591919 }, { "auxiliary_loss_clip": 0.0133717, "auxiliary_loss_mlp": 0.01103766, "balance_loss_clip": 1.09343684, "balance_loss_mlp": 1.06004047, "epoch": 0.017916729295054862, "flos": 20230061310720.0, "grad_norm": 2.689351030321763, "language_loss": 0.88947791, "learning_rate": 3.6680860809130346e-06, "loss": 0.91388726, "num_input_tokens_seen": 6219170, "step": 298, "time_per_iteration": 4.1055779457092285 }, { "auxiliary_loss_clip": 0.01333843, "auxiliary_loss_mlp": 0.01121179, "balance_loss_clip": 1.09470236, "balance_loss_mlp": 1.07499719, "epoch": 0.01797685254772283, "flos": 19390972625280.0, "grad_norm": 1.8935027270905305, "language_loss": 0.88550889, "learning_rate": 3.6702430416690516e-06, "loss": 0.91005915, "num_input_tokens_seen": 6237930, "step": 299, "time_per_iteration": 2.611168622970581 }, { "auxiliary_loss_clip": 0.0133938, "auxiliary_loss_mlp": 0.0110718, "balance_loss_clip": 1.09468794, "balance_loss_mlp": 1.06130886, "epoch": 0.018036975800390802, "flos": 24426007528320.0, "grad_norm": 4.075580609786654, "language_loss": 0.64664406, "learning_rate": 3.672392800539357e-06, "loss": 0.67110968, "num_input_tokens_seen": 6257170, "step": 300, "time_per_iteration": 2.645603656768799 }, { "auxiliary_loss_clip": 0.01338559, "auxiliary_loss_mlp": 0.01111665, "balance_loss_clip": 1.09775913, "balance_loss_mlp": 1.06636548, "epoch": 0.01809709905305877, "flos": 15778933896960.0, "grad_norm": 2.5071418214687515, "language_loss": 0.87940675, "learning_rate": 3.6745354054567686e-06, "loss": 0.90390897, "num_input_tokens_seen": 6274780, "step": 301, "time_per_iteration": 2.6035923957824707 }, { "auxiliary_loss_clip": 0.01238361, "auxiliary_loss_mlp": 0.01073699, "balance_loss_clip": 1.1100142, "balance_loss_mlp": 1.05901265, "epoch": 0.01815722230572674, "flos": 67348382526720.0, "grad_norm": 0.8350739260664176, "language_loss": 0.62219667, "learning_rate": 3.676670903877158e-06, "loss": 0.64531732, "num_input_tokens_seen": 6340435, "step": 302, "time_per_iteration": 3.3307297229766846 }, { "auxiliary_loss_clip": 0.0132981, "auxiliary_loss_mlp": 0.01110918, "balance_loss_clip": 1.0910126, "balance_loss_mlp": 1.06507051, "epoch": 0.01821734555839471, "flos": 15485615435520.0, "grad_norm": 2.115144575016314, "language_loss": 0.89737153, "learning_rate": 3.6787993427857567e-06, "loss": 0.9217788, "num_input_tokens_seen": 6358160, "step": 303, "time_per_iteration": 2.6773293018341064 }, { "auxiliary_loss_clip": 0.01335628, "auxiliary_loss_mlp": 0.01118481, "balance_loss_clip": 1.09579217, "balance_loss_mlp": 1.07237101, "epoch": 0.018277468811062677, "flos": 24097424889600.0, "grad_norm": 1.8670669350935472, "language_loss": 0.80417514, "learning_rate": 3.680920768703364e-06, "loss": 0.82871628, "num_input_tokens_seen": 6378485, "step": 304, "time_per_iteration": 2.691347360610962 }, { "auxiliary_loss_clip": 0.01330802, "auxiliary_loss_mlp": 0.01091671, "balance_loss_clip": 1.09832263, "balance_loss_mlp": 1.04858923, "epoch": 0.01833759206373065, "flos": 20959335141120.0, "grad_norm": 1.6863564291935742, "language_loss": 0.82761526, "learning_rate": 3.6830352276924415e-06, "loss": 0.85184002, "num_input_tokens_seen": 6397845, "step": 305, "time_per_iteration": 2.6883981227874756 }, { "auxiliary_loss_clip": 0.01330759, "auxiliary_loss_mlp": 0.01093908, "balance_loss_clip": 1.09012437, "balance_loss_mlp": 1.05115986, "epoch": 0.018397715316398618, "flos": 19390757143680.0, "grad_norm": 2.1780708917523297, "language_loss": 0.91148543, "learning_rate": 3.685142765363119e-06, "loss": 0.93573213, "num_input_tokens_seen": 6416475, "step": 306, "time_per_iteration": 2.6465187072753906 }, { "auxiliary_loss_clip": 0.01324743, "auxiliary_loss_mlp": 0.01091696, "balance_loss_clip": 1.08900762, "balance_loss_mlp": 1.04882836, "epoch": 0.018457838569066586, "flos": 29132531619840.0, "grad_norm": 3.4680205003751072, "language_loss": 0.86581063, "learning_rate": 3.687243426879095e-06, "loss": 0.88997507, "num_input_tokens_seen": 6437520, "step": 307, "time_per_iteration": 2.7787318229675293 }, { "auxiliary_loss_clip": 0.01326572, "auxiliary_loss_mlp": 0.01110018, "balance_loss_clip": 1.09346747, "balance_loss_mlp": 1.06247783, "epoch": 0.018517961821734555, "flos": 19208654167680.0, "grad_norm": 2.413130156754219, "language_loss": 0.71650648, "learning_rate": 3.6893372569634466e-06, "loss": 0.74087244, "num_input_tokens_seen": 6455680, "step": 308, "time_per_iteration": 2.652973175048828 }, { "auxiliary_loss_clip": 0.01331912, "auxiliary_loss_mlp": 0.01102766, "balance_loss_clip": 1.09061241, "balance_loss_mlp": 1.05911207, "epoch": 0.018578085074402523, "flos": 19863018184320.0, "grad_norm": 2.1869498369051077, "language_loss": 0.91841364, "learning_rate": 3.6914242999043395e-06, "loss": 0.94276047, "num_input_tokens_seen": 6474880, "step": 309, "time_per_iteration": 2.6613030433654785 }, { "auxiliary_loss_clip": 0.01339178, "auxiliary_loss_mlp": 0.01096668, "balance_loss_clip": 1.09145641, "balance_loss_mlp": 1.05084395, "epoch": 0.018638208327070496, "flos": 29606947476480.0, "grad_norm": 2.0400456475786353, "language_loss": 0.72784412, "learning_rate": 3.69350459956065e-06, "loss": 0.75220263, "num_input_tokens_seen": 6495945, "step": 310, "time_per_iteration": 2.705345392227173 }, { "auxiliary_loss_clip": 0.01331019, "auxiliary_loss_mlp": 0.01113021, "balance_loss_clip": 1.09560525, "balance_loss_mlp": 1.06922317, "epoch": 0.018698331579738464, "flos": 45731555907840.0, "grad_norm": 2.1345597100799645, "language_loss": 0.74162471, "learning_rate": 3.695578199367497e-06, "loss": 0.76606506, "num_input_tokens_seen": 6519930, "step": 311, "time_per_iteration": 2.846503496170044 }, { "auxiliary_loss_clip": 0.01338389, "auxiliary_loss_mlp": 0.01104203, "balance_loss_clip": 1.09206033, "balance_loss_mlp": 1.0609777, "epoch": 0.018758454832406433, "flos": 20483662308480.0, "grad_norm": 3.713635021153945, "language_loss": 0.91668129, "learning_rate": 3.6976451423416825e-06, "loss": 0.94110715, "num_input_tokens_seen": 6535070, "step": 312, "time_per_iteration": 2.598400592803955 }, { "auxiliary_loss_clip": 0.01339145, "auxiliary_loss_mlp": 0.01116197, "balance_loss_clip": 1.09512305, "balance_loss_mlp": 1.07034922, "epoch": 0.0188185780850744, "flos": 15777784661760.0, "grad_norm": 4.5530066286460045, "language_loss": 0.89634913, "learning_rate": 3.699705471087043e-06, "loss": 0.92090249, "num_input_tokens_seen": 6554135, "step": 313, "time_per_iteration": 2.6944596767425537 }, { "auxiliary_loss_clip": 0.01340962, "auxiliary_loss_mlp": 0.0109941, "balance_loss_clip": 1.09381938, "balance_loss_mlp": 1.05430174, "epoch": 0.018878701337742373, "flos": 22455732758400.0, "grad_norm": 2.3990870717118455, "language_loss": 0.7335974, "learning_rate": 3.7017592277997256e-06, "loss": 0.75800109, "num_input_tokens_seen": 6572275, "step": 314, "time_per_iteration": 2.6550133228302 }, { "auxiliary_loss_clip": 0.01329658, "auxiliary_loss_mlp": 0.01105546, "balance_loss_clip": 1.09075165, "balance_loss_mlp": 1.06246412, "epoch": 0.018938824590410342, "flos": 30993530238720.0, "grad_norm": 5.81191681220521, "language_loss": 0.89890182, "learning_rate": 3.7038064542733654e-06, "loss": 0.92325383, "num_input_tokens_seen": 6594520, "step": 315, "time_per_iteration": 2.7121222019195557 }, { "auxiliary_loss_clip": 0.0133262, "auxiliary_loss_mlp": 0.01096177, "balance_loss_clip": 1.09287357, "balance_loss_mlp": 1.05209303, "epoch": 0.01899894784307831, "flos": 23258910821760.0, "grad_norm": 2.446494284682687, "language_loss": 0.80517328, "learning_rate": 3.7058471919041945e-06, "loss": 0.82946122, "num_input_tokens_seen": 6614245, "step": 316, "time_per_iteration": 2.640573501586914 }, { "auxiliary_loss_clip": 0.01326654, "auxiliary_loss_mlp": 0.01094904, "balance_loss_clip": 1.09036672, "balance_loss_mlp": 1.05046248, "epoch": 0.01905907109574628, "flos": 17457901367040.0, "grad_norm": 2.3705495670370524, "language_loss": 0.90161496, "learning_rate": 3.7078814816960605e-06, "loss": 0.92583054, "num_input_tokens_seen": 6632015, "step": 317, "time_per_iteration": 2.594388246536255 }, { "auxiliary_loss_clip": 0.01324014, "auxiliary_loss_mlp": 0.01097498, "balance_loss_clip": 1.08944559, "balance_loss_mlp": 1.05281842, "epoch": 0.019119194348414248, "flos": 14970225139200.0, "grad_norm": 7.443622240044352, "language_loss": 0.90836811, "learning_rate": 3.709909364265374e-06, "loss": 0.93258321, "num_input_tokens_seen": 6649015, "step": 318, "time_per_iteration": 2.6647114753723145 }, { "auxiliary_loss_clip": 0.01326579, "auxiliary_loss_mlp": 0.01092817, "balance_loss_clip": 1.0886786, "balance_loss_mlp": 1.05102181, "epoch": 0.01917931760108222, "flos": 25482822503040.0, "grad_norm": 2.232217614618188, "language_loss": 0.93955356, "learning_rate": 3.7119308798459706e-06, "loss": 0.9637475, "num_input_tokens_seen": 6669225, "step": 319, "time_per_iteration": 2.6901800632476807 }, { "auxiliary_loss_clip": 0.01209258, "auxiliary_loss_mlp": 0.01057567, "balance_loss_clip": 1.08611965, "balance_loss_mlp": 1.04288089, "epoch": 0.01923944085375019, "flos": 71556967353600.0, "grad_norm": 1.0009907084180605, "language_loss": 0.59817195, "learning_rate": 3.7139460682939026e-06, "loss": 0.62084019, "num_input_tokens_seen": 6725775, "step": 320, "time_per_iteration": 3.1044812202453613 }, { "auxiliary_loss_clip": 0.01323701, "auxiliary_loss_mlp": 0.01105882, "balance_loss_clip": 1.08827436, "balance_loss_mlp": 1.06291938, "epoch": 0.019299564106418157, "flos": 19682495406720.0, "grad_norm": 3.6735645336458163, "language_loss": 0.89620435, "learning_rate": 3.715954969092154e-06, "loss": 0.92050016, "num_input_tokens_seen": 6744170, "step": 321, "time_per_iteration": 2.650325298309326 }, { "auxiliary_loss_clip": 0.01333523, "auxiliary_loss_mlp": 0.01118534, "balance_loss_clip": 1.09200621, "balance_loss_mlp": 1.07440257, "epoch": 0.019359687359086126, "flos": 24387151991040.0, "grad_norm": 2.289334718991835, "language_loss": 0.82897186, "learning_rate": 3.7179576213552805e-06, "loss": 0.85349244, "num_input_tokens_seen": 6764565, "step": 322, "time_per_iteration": 2.65793514251709 }, { "auxiliary_loss_clip": 0.01332983, "auxiliary_loss_mlp": 0.01092262, "balance_loss_clip": 1.09035325, "balance_loss_mlp": 1.05061018, "epoch": 0.019419810611754094, "flos": 23951376190080.0, "grad_norm": 2.3678949255052912, "language_loss": 0.72983897, "learning_rate": 3.719954063833981e-06, "loss": 0.75409144, "num_input_tokens_seen": 6785310, "step": 323, "time_per_iteration": 2.6827828884124756 }, { "auxiliary_loss_clip": 0.01321298, "auxiliary_loss_mlp": 0.01092254, "balance_loss_clip": 1.08474624, "balance_loss_mlp": 1.04974401, "epoch": 0.019479933864422067, "flos": 22160223567360.0, "grad_norm": 1.9971507164977458, "language_loss": 0.92358303, "learning_rate": 3.721944334919596e-06, "loss": 0.9477185, "num_input_tokens_seen": 6803290, "step": 324, "time_per_iteration": 2.667363405227661 }, { "auxiliary_loss_clip": 0.0133014, "auxiliary_loss_mlp": 0.01089098, "balance_loss_clip": 1.09217644, "balance_loss_mlp": 1.04878139, "epoch": 0.019540057117090035, "flos": 22236821320320.0, "grad_norm": 6.407507213214319, "language_loss": 0.65127969, "learning_rate": 3.7239284726485375e-06, "loss": 0.67547202, "num_input_tokens_seen": 6822570, "step": 325, "time_per_iteration": 2.658700466156006 }, { "auxiliary_loss_clip": 0.01328385, "auxiliary_loss_mlp": 0.01109788, "balance_loss_clip": 1.09598839, "balance_loss_mlp": 1.06675363, "epoch": 0.019600180369758004, "flos": 23076771932160.0, "grad_norm": 1.7177375017641943, "language_loss": 0.76394802, "learning_rate": 3.72590651470665e-06, "loss": 0.78832972, "num_input_tokens_seen": 6841910, "step": 326, "time_per_iteration": 2.6326630115509033 }, { "auxiliary_loss_clip": 0.01322824, "auxiliary_loss_mlp": 0.01103487, "balance_loss_clip": 1.09083152, "balance_loss_mlp": 1.06040514, "epoch": 0.019660303622425972, "flos": 25410857604480.0, "grad_norm": 2.041100065316132, "language_loss": 0.79262185, "learning_rate": 3.727878498433505e-06, "loss": 0.81688493, "num_input_tokens_seen": 6862480, "step": 327, "time_per_iteration": 2.7195518016815186 }, { "auxiliary_loss_clip": 0.0132945, "auxiliary_loss_mlp": 0.01099712, "balance_loss_clip": 1.09292865, "balance_loss_mlp": 1.05832207, "epoch": 0.01972042687509394, "flos": 23657519024640.0, "grad_norm": 2.852301933148325, "language_loss": 0.80569315, "learning_rate": 3.7298444608266328e-06, "loss": 0.82998472, "num_input_tokens_seen": 6882015, "step": 328, "time_per_iteration": 2.6789369583129883 }, { "auxiliary_loss_clip": 0.01327544, "auxiliary_loss_mlp": 0.01094059, "balance_loss_clip": 1.08719349, "balance_loss_mlp": 1.05045235, "epoch": 0.019780550127761913, "flos": 18223480869120.0, "grad_norm": 2.280823996815513, "language_loss": 0.93599927, "learning_rate": 3.731804438545683e-06, "loss": 0.96021533, "num_input_tokens_seen": 6899785, "step": 329, "time_per_iteration": 2.6043548583984375 }, { "auxiliary_loss_clip": 0.0133329, "auxiliary_loss_mlp": 0.0110952, "balance_loss_clip": 1.09211767, "balance_loss_mlp": 1.06629419, "epoch": 0.01984067338042988, "flos": 22418780641920.0, "grad_norm": 2.788704520584699, "language_loss": 0.7476396, "learning_rate": 3.7337584679165324e-06, "loss": 0.77206767, "num_input_tokens_seen": 6918575, "step": 330, "time_per_iteration": 2.706001043319702 }, { "auxiliary_loss_clip": 0.0133006, "auxiliary_loss_mlp": 0.01115344, "balance_loss_clip": 1.09077096, "balance_loss_mlp": 1.07280993, "epoch": 0.01990079663309785, "flos": 17055199013760.0, "grad_norm": 4.201650057157668, "language_loss": 0.93435889, "learning_rate": 3.7357065849353186e-06, "loss": 0.95881295, "num_input_tokens_seen": 6936965, "step": 331, "time_per_iteration": 2.6499180793762207 }, { "auxiliary_loss_clip": 0.01316843, "auxiliary_loss_mlp": 0.01085812, "balance_loss_clip": 1.08825564, "balance_loss_mlp": 1.04563856, "epoch": 0.01996091988576582, "flos": 15961791058560.0, "grad_norm": 2.5475056489813968, "language_loss": 0.9293468, "learning_rate": 3.737648825272422e-06, "loss": 0.95337331, "num_input_tokens_seen": 6953475, "step": 332, "time_per_iteration": 2.5990231037139893 }, { "auxiliary_loss_clip": 0.01325701, "auxiliary_loss_mlp": 0.01091941, "balance_loss_clip": 1.09376514, "balance_loss_mlp": 1.04902601, "epoch": 0.02002104313843379, "flos": 23586451966080.0, "grad_norm": 2.7319388202061106, "language_loss": 0.75380504, "learning_rate": 3.739585224276384e-06, "loss": 0.77798152, "num_input_tokens_seen": 6971630, "step": 333, "time_per_iteration": 2.6225569248199463 }, { "auxiliary_loss_clip": 0.01323488, "auxiliary_loss_mlp": 0.01083816, "balance_loss_clip": 1.08822608, "balance_loss_mlp": 1.04249835, "epoch": 0.02008116639110176, "flos": 34094883352320.0, "grad_norm": 3.3732742696494924, "language_loss": 0.78797042, "learning_rate": 3.7415158169777673e-06, "loss": 0.81204355, "num_input_tokens_seen": 6992775, "step": 334, "time_per_iteration": 2.725562572479248 }, { "auxiliary_loss_clip": 0.01325152, "auxiliary_loss_mlp": 0.01093257, "balance_loss_clip": 1.08535278, "balance_loss_mlp": 1.04867256, "epoch": 0.020141289643769728, "flos": 19683716469120.0, "grad_norm": 1.945115565921162, "language_loss": 0.83465719, "learning_rate": 3.7434406380929575e-06, "loss": 0.8588413, "num_input_tokens_seen": 7011425, "step": 335, "time_per_iteration": 2.638871192932129 }, { "auxiliary_loss_clip": 0.01322365, "auxiliary_loss_mlp": 0.01085854, "balance_loss_clip": 1.08842373, "balance_loss_mlp": 1.04405963, "epoch": 0.020201412896437697, "flos": 20740567357440.0, "grad_norm": 2.3527147371949058, "language_loss": 0.92432821, "learning_rate": 3.745359722027911e-06, "loss": 0.94841033, "num_input_tokens_seen": 7029450, "step": 336, "time_per_iteration": 2.6654980182647705 }, { "auxiliary_loss_clip": 0.01321531, "auxiliary_loss_mlp": 0.01079695, "balance_loss_clip": 1.08577883, "balance_loss_mlp": 1.03818631, "epoch": 0.020261536149105665, "flos": 20266510636800.0, "grad_norm": 1.7223490941555537, "language_loss": 0.88663971, "learning_rate": 3.7472731028818428e-06, "loss": 0.91065204, "num_input_tokens_seen": 7047555, "step": 337, "time_per_iteration": 4.246743440628052 }, { "auxiliary_loss_clip": 0.01312441, "auxiliary_loss_mlp": 0.01102336, "balance_loss_clip": 1.08320296, "balance_loss_mlp": 1.05841899, "epoch": 0.020321659401773638, "flos": 25848752307840.0, "grad_norm": 1.6493597356962735, "language_loss": 0.89869279, "learning_rate": 3.7491808144508626e-06, "loss": 0.92284054, "num_input_tokens_seen": 7068185, "step": 338, "time_per_iteration": 5.869866609573364 }, { "auxiliary_loss_clip": 0.01321566, "auxiliary_loss_mlp": 0.0109858, "balance_loss_clip": 1.08546185, "balance_loss_mlp": 1.05554605, "epoch": 0.020381782654441606, "flos": 17495033051520.0, "grad_norm": 2.1603069065052694, "language_loss": 0.85168982, "learning_rate": 3.7510828902315576e-06, "loss": 0.87589133, "num_input_tokens_seen": 7085955, "step": 339, "time_per_iteration": 2.603130340576172 }, { "auxiliary_loss_clip": 0.01328225, "auxiliary_loss_mlp": 0.01099064, "balance_loss_clip": 1.0902226, "balance_loss_mlp": 1.05524242, "epoch": 0.020441905907109575, "flos": 24243940465920.0, "grad_norm": 2.1746002196087817, "language_loss": 0.88821882, "learning_rate": 3.75297936342452e-06, "loss": 0.91249174, "num_input_tokens_seen": 7106345, "step": 340, "time_per_iteration": 2.7247626781463623 }, { "auxiliary_loss_clip": 0.01322505, "auxiliary_loss_mlp": 0.01085559, "balance_loss_clip": 1.08594203, "balance_loss_mlp": 1.04004502, "epoch": 0.020502029159777543, "flos": 22233301787520.0, "grad_norm": 2.004763613818719, "language_loss": 0.88489276, "learning_rate": 3.7548702669378253e-06, "loss": 0.9089734, "num_input_tokens_seen": 7125070, "step": 341, "time_per_iteration": 2.731411933898926 }, { "auxiliary_loss_clip": 0.01324734, "auxiliary_loss_mlp": 0.01098572, "balance_loss_clip": 1.08451748, "balance_loss_mlp": 1.05479813, "epoch": 0.020562152412445512, "flos": 23987861429760.0, "grad_norm": 2.3638593093640736, "language_loss": 0.80611861, "learning_rate": 3.756755633390458e-06, "loss": 0.83035159, "num_input_tokens_seen": 7144675, "step": 342, "time_per_iteration": 2.6085095405578613 }, { "auxiliary_loss_clip": 0.01313805, "auxiliary_loss_mlp": 0.01098164, "balance_loss_clip": 1.08411694, "balance_loss_mlp": 1.05138612, "epoch": 0.020622275665113484, "flos": 26975305537920.0, "grad_norm": 1.727276092160433, "language_loss": 0.89612651, "learning_rate": 3.7586354951156886e-06, "loss": 0.92024612, "num_input_tokens_seen": 7165505, "step": 343, "time_per_iteration": 2.739912509918213 }, { "auxiliary_loss_clip": 0.01324722, "auxiliary_loss_mlp": 0.01096954, "balance_loss_clip": 1.09109879, "balance_loss_mlp": 1.05518293, "epoch": 0.020682398917781453, "flos": 22600704049920.0, "grad_norm": 2.6902665590614663, "language_loss": 0.78381217, "learning_rate": 3.7605098841644e-06, "loss": 0.80802888, "num_input_tokens_seen": 7184605, "step": 344, "time_per_iteration": 2.638439655303955 }, { "auxiliary_loss_clip": 0.01310552, "auxiliary_loss_mlp": 0.01103983, "balance_loss_clip": 1.08375537, "balance_loss_mlp": 1.05982804, "epoch": 0.02074252217044942, "flos": 15013605790080.0, "grad_norm": 2.2675296623639114, "language_loss": 0.75051636, "learning_rate": 3.7623788323083666e-06, "loss": 0.77466166, "num_input_tokens_seen": 7203065, "step": 345, "time_per_iteration": 2.581258773803711 }, { "auxiliary_loss_clip": 0.01316305, "auxiliary_loss_mlp": 0.01107937, "balance_loss_clip": 1.08855689, "balance_loss_mlp": 1.06447339, "epoch": 0.02080264542311739, "flos": 25337958952320.0, "grad_norm": 2.2144688897761395, "language_loss": 0.90414572, "learning_rate": 3.7642423710434837e-06, "loss": 0.92838824, "num_input_tokens_seen": 7222995, "step": 346, "time_per_iteration": 2.6281676292419434 }, { "auxiliary_loss_clip": 0.01312286, "auxiliary_loss_mlp": 0.01096576, "balance_loss_clip": 1.08357453, "balance_loss_mlp": 1.05621195, "epoch": 0.02086276867578536, "flos": 24388804016640.0, "grad_norm": 3.1106741063140366, "language_loss": 0.79133296, "learning_rate": 3.7661005315929563e-06, "loss": 0.81542158, "num_input_tokens_seen": 7244625, "step": 347, "time_per_iteration": 2.6477038860321045 }, { "auxiliary_loss_clip": 0.01317665, "auxiliary_loss_mlp": 0.01097416, "balance_loss_clip": 1.08921003, "balance_loss_mlp": 1.05328524, "epoch": 0.02092289192845333, "flos": 24462205459200.0, "grad_norm": 3.7065871267995893, "language_loss": 0.71211165, "learning_rate": 3.7679533449104354e-06, "loss": 0.73626244, "num_input_tokens_seen": 7263255, "step": 348, "time_per_iteration": 2.6215686798095703 }, { "auxiliary_loss_clip": 0.01319168, "auxiliary_loss_mlp": 0.01104109, "balance_loss_clip": 1.0859139, "balance_loss_mlp": 1.06066906, "epoch": 0.0209830151811213, "flos": 17451185523840.0, "grad_norm": 2.3976328225512495, "language_loss": 0.77118891, "learning_rate": 3.7698008416831116e-06, "loss": 0.79542166, "num_input_tokens_seen": 7279275, "step": 349, "time_per_iteration": 2.60102915763855 }, { "auxiliary_loss_clip": 0.01304146, "auxiliary_loss_mlp": 0.01101496, "balance_loss_clip": 1.08412242, "balance_loss_mlp": 1.06017756, "epoch": 0.021043138433789268, "flos": 24573995562240.0, "grad_norm": 1.7599420553547571, "language_loss": 0.85191035, "learning_rate": 3.7716430523347664e-06, "loss": 0.87596673, "num_input_tokens_seen": 7300180, "step": 350, "time_per_iteration": 2.7636313438415527 }, { "auxiliary_loss_clip": 0.01310639, "auxiliary_loss_mlp": 0.01090182, "balance_loss_clip": 1.08742464, "balance_loss_mlp": 1.05015147, "epoch": 0.021103261686457236, "flos": 24454053072000.0, "grad_norm": 2.2188224040826956, "language_loss": 0.7998929, "learning_rate": 3.773480007028776e-06, "loss": 0.82390112, "num_input_tokens_seen": 7317430, "step": 351, "time_per_iteration": 2.651803493499756 }, { "auxiliary_loss_clip": 0.01318922, "auxiliary_loss_mlp": 0.01104903, "balance_loss_clip": 1.08851838, "balance_loss_mlp": 1.06093884, "epoch": 0.021163384939125205, "flos": 14683083816960.0, "grad_norm": 2.30399977815629, "language_loss": 0.8746841, "learning_rate": 3.775311735671078e-06, "loss": 0.89892232, "num_input_tokens_seen": 7334875, "step": 352, "time_per_iteration": 2.687080144882202 }, { "auxiliary_loss_clip": 0.01311303, "auxiliary_loss_mlp": 0.01101912, "balance_loss_clip": 1.0859803, "balance_loss_mlp": 1.05861485, "epoch": 0.021223508191793177, "flos": 24493195918080.0, "grad_norm": 2.574621592267882, "language_loss": 0.8247534, "learning_rate": 3.7771382679130878e-06, "loss": 0.84888554, "num_input_tokens_seen": 7355185, "step": 353, "time_per_iteration": 2.7096078395843506 }, { "auxiliary_loss_clip": 0.01308698, "auxiliary_loss_mlp": 0.01092448, "balance_loss_clip": 1.08573294, "balance_loss_mlp": 1.05160654, "epoch": 0.021283631444461146, "flos": 24126978804480.0, "grad_norm": 1.9591973719581535, "language_loss": 0.8089481, "learning_rate": 3.7789596331545845e-06, "loss": 0.83295953, "num_input_tokens_seen": 7374425, "step": 354, "time_per_iteration": 2.658649444580078 }, { "auxiliary_loss_clip": 0.01314249, "auxiliary_loss_mlp": 0.01095812, "balance_loss_clip": 1.08369493, "balance_loss_mlp": 1.05218124, "epoch": 0.021343754697129114, "flos": 25192233475200.0, "grad_norm": 2.22170783568627, "language_loss": 0.81311834, "learning_rate": 3.780775860546545e-06, "loss": 0.837219, "num_input_tokens_seen": 7394175, "step": 355, "time_per_iteration": 2.619551420211792 }, { "auxiliary_loss_clip": 0.01310207, "auxiliary_loss_mlp": 0.01090401, "balance_loss_clip": 1.08222032, "balance_loss_mlp": 1.04851055, "epoch": 0.021403877949797083, "flos": 17274182279040.0, "grad_norm": 2.212340256471132, "language_loss": 0.89746779, "learning_rate": 3.7825869789939474e-06, "loss": 0.92147392, "num_input_tokens_seen": 7412645, "step": 356, "time_per_iteration": 2.5877137184143066 }, { "auxiliary_loss_clip": 0.01308298, "auxiliary_loss_mlp": 0.0108474, "balance_loss_clip": 1.08573771, "balance_loss_mlp": 1.04191971, "epoch": 0.021464001202465055, "flos": 30917435276160.0, "grad_norm": 1.9878508054592678, "language_loss": 0.79956681, "learning_rate": 3.784393017158528e-06, "loss": 0.82349718, "num_input_tokens_seen": 7432275, "step": 357, "time_per_iteration": 2.781755208969116 }, { "auxiliary_loss_clip": 0.0130988, "auxiliary_loss_mlp": 0.01083565, "balance_loss_clip": 1.08250284, "balance_loss_mlp": 1.04417801, "epoch": 0.021524124455133024, "flos": 18186385098240.0, "grad_norm": 2.6679617624252137, "language_loss": 0.76516652, "learning_rate": 3.786194003461506e-06, "loss": 0.78910094, "num_input_tokens_seen": 7450245, "step": 358, "time_per_iteration": 2.63144850730896 }, { "auxiliary_loss_clip": 0.01307251, "auxiliary_loss_mlp": 0.01092013, "balance_loss_clip": 1.08083165, "balance_loss_mlp": 1.04842997, "epoch": 0.021584247707800992, "flos": 13805786039040.0, "grad_norm": 2.344744226979962, "language_loss": 0.88770491, "learning_rate": 3.787989966086264e-06, "loss": 0.91169769, "num_input_tokens_seen": 7466845, "step": 359, "time_per_iteration": 2.641932964324951 }, { "auxiliary_loss_clip": 0.01315087, "auxiliary_loss_mlp": 0.01090441, "balance_loss_clip": 1.08486438, "balance_loss_mlp": 1.05088758, "epoch": 0.02164437096046896, "flos": 23294713703040.0, "grad_norm": 3.6505103877164804, "language_loss": 0.75853801, "learning_rate": 3.789780932980997e-06, "loss": 0.78259325, "num_input_tokens_seen": 7485450, "step": 360, "time_per_iteration": 2.5901477336883545 }, { "auxiliary_loss_clip": 0.01203506, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.07682121, "balance_loss_mlp": 1.01781011, "epoch": 0.02170449421313693, "flos": 68899578341760.0, "grad_norm": 0.8439708743577624, "language_loss": 0.64861441, "learning_rate": 3.79156693186132e-06, "loss": 0.67095727, "num_input_tokens_seen": 7553780, "step": 361, "time_per_iteration": 3.278409957885742 }, { "auxiliary_loss_clip": 0.01306068, "auxiliary_loss_mlp": 0.01086116, "balance_loss_clip": 1.0792098, "balance_loss_mlp": 1.04501224, "epoch": 0.0217646174658049, "flos": 25228539146880.0, "grad_norm": 3.144635825096315, "language_loss": 0.78844237, "learning_rate": 3.7933479902128433e-06, "loss": 0.81236422, "num_input_tokens_seen": 7574155, "step": 362, "time_per_iteration": 2.6302051544189453 }, { "auxiliary_loss_clip": 0.01309585, "auxiliary_loss_mlp": 0.01093258, "balance_loss_clip": 1.08188891, "balance_loss_mlp": 1.05244076, "epoch": 0.02182474071847287, "flos": 22893124671360.0, "grad_norm": 2.019833715135914, "language_loss": 0.92474592, "learning_rate": 3.7951241352937077e-06, "loss": 0.94877434, "num_input_tokens_seen": 7592320, "step": 363, "time_per_iteration": 2.6566081047058105 }, { "auxiliary_loss_clip": 0.01305173, "auxiliary_loss_mlp": 0.01096467, "balance_loss_clip": 1.0816617, "balance_loss_mlp": 1.05693769, "epoch": 0.02188486397114084, "flos": 23658991482240.0, "grad_norm": 2.282586403147275, "language_loss": 0.89844346, "learning_rate": 3.7968953941370915e-06, "loss": 0.92245984, "num_input_tokens_seen": 7611185, "step": 364, "time_per_iteration": 2.711911201477051 }, { "auxiliary_loss_clip": 0.01311963, "auxiliary_loss_mlp": 0.0109247, "balance_loss_clip": 1.08607888, "balance_loss_mlp": 1.04955506, "epoch": 0.021944987223808807, "flos": 21543637680000.0, "grad_norm": 1.948927065488749, "language_loss": 0.79460645, "learning_rate": 3.798661793553676e-06, "loss": 0.81865084, "num_input_tokens_seen": 7631970, "step": 365, "time_per_iteration": 2.6396052837371826 }, { "auxiliary_loss_clip": 0.01306043, "auxiliary_loss_mlp": 0.01100405, "balance_loss_clip": 1.08267248, "balance_loss_mlp": 1.05658317, "epoch": 0.022005110476476776, "flos": 16070887641600.0, "grad_norm": 1.85181498507666, "language_loss": 0.84341359, "learning_rate": 3.8004233601340808e-06, "loss": 0.86747801, "num_input_tokens_seen": 7649745, "step": 366, "time_per_iteration": 2.6278867721557617 }, { "auxiliary_loss_clip": 0.01312113, "auxiliary_loss_mlp": 0.01087574, "balance_loss_clip": 1.08304918, "balance_loss_mlp": 1.04859269, "epoch": 0.022065233729144748, "flos": 21433715084160.0, "grad_norm": 1.9326288300300676, "language_loss": 0.87040466, "learning_rate": 3.8021801202512694e-06, "loss": 0.89440155, "num_input_tokens_seen": 7668830, "step": 367, "time_per_iteration": 2.6410560607910156 }, { "auxiliary_loss_clip": 0.01312217, "auxiliary_loss_mlp": 0.01096053, "balance_loss_clip": 1.08074582, "balance_loss_mlp": 1.05335259, "epoch": 0.022125356981812717, "flos": 21543709507200.0, "grad_norm": 2.7247329926128976, "language_loss": 0.8487373, "learning_rate": 3.803932100062912e-06, "loss": 0.87282002, "num_input_tokens_seen": 7687240, "step": 368, "time_per_iteration": 2.652012825012207 }, { "auxiliary_loss_clip": 0.01312089, "auxiliary_loss_mlp": 0.01079926, "balance_loss_clip": 1.0801568, "balance_loss_mlp": 1.04027653, "epoch": 0.022185480234480685, "flos": 20704153944960.0, "grad_norm": 2.4839328990540794, "language_loss": 0.75997221, "learning_rate": 3.8056793255137264e-06, "loss": 0.78389233, "num_input_tokens_seen": 7704440, "step": 369, "time_per_iteration": 2.601384401321411 }, { "auxiliary_loss_clip": 0.01306737, "auxiliary_loss_mlp": 0.01099274, "balance_loss_clip": 1.08232927, "balance_loss_mlp": 1.05836105, "epoch": 0.022245603487148654, "flos": 25193203142400.0, "grad_norm": 2.189428421230448, "language_loss": 0.82977992, "learning_rate": 3.8074218223377844e-06, "loss": 0.85383999, "num_input_tokens_seen": 7727160, "step": 370, "time_per_iteration": 2.6538548469543457 }, { "auxiliary_loss_clip": 0.01306327, "auxiliary_loss_mlp": 0.01099594, "balance_loss_clip": 1.08127654, "balance_loss_mlp": 1.05713177, "epoch": 0.022305726739816623, "flos": 21395936954880.0, "grad_norm": 1.8569755368340455, "language_loss": 0.81588483, "learning_rate": 3.8091596160607834e-06, "loss": 0.83994406, "num_input_tokens_seen": 7747730, "step": 371, "time_per_iteration": 2.6779489517211914 }, { "auxiliary_loss_clip": 0.01311283, "auxiliary_loss_mlp": 0.01093653, "balance_loss_clip": 1.08593988, "balance_loss_mlp": 1.05169153, "epoch": 0.022365849992484595, "flos": 22492146170880.0, "grad_norm": 2.0622769904034817, "language_loss": 0.83493644, "learning_rate": 3.8108927320022896e-06, "loss": 0.85898578, "num_input_tokens_seen": 7766765, "step": 372, "time_per_iteration": 2.676797866821289 }, { "auxiliary_loss_clip": 0.01303906, "auxiliary_loss_mlp": 0.01091688, "balance_loss_clip": 1.08125615, "balance_loss_mlp": 1.05022752, "epoch": 0.022425973245152563, "flos": 17856581397120.0, "grad_norm": 2.8569846697004424, "language_loss": 0.79004842, "learning_rate": 3.8126211952779548e-06, "loss": 0.81400436, "num_input_tokens_seen": 7784010, "step": 373, "time_per_iteration": 2.593186616897583 }, { "auxiliary_loss_clip": 0.01309731, "auxiliary_loss_mlp": 0.01087409, "balance_loss_clip": 1.08431911, "balance_loss_mlp": 1.0448271, "epoch": 0.022486096497820532, "flos": 15483029656320.0, "grad_norm": 2.5442660874947385, "language_loss": 0.77622557, "learning_rate": 3.8143450308016952e-06, "loss": 0.80019701, "num_input_tokens_seen": 7801305, "step": 374, "time_per_iteration": 2.628392457962036 }, { "auxiliary_loss_clip": 0.0129871, "auxiliary_loss_mlp": 0.01076131, "balance_loss_clip": 1.07404125, "balance_loss_mlp": 1.03395462, "epoch": 0.0225462197504885, "flos": 27784157950080.0, "grad_norm": 1.574507922341891, "language_loss": 0.86032569, "learning_rate": 3.8160642632878525e-06, "loss": 0.88407415, "num_input_tokens_seen": 7823965, "step": 375, "time_per_iteration": 2.6783435344696045 }, { "auxiliary_loss_clip": 0.01307026, "auxiliary_loss_mlp": 0.01102393, "balance_loss_clip": 1.08340597, "balance_loss_mlp": 1.0590483, "epoch": 0.02260634300315647, "flos": 19975490645760.0, "grad_norm": 2.1279260859120286, "language_loss": 0.8901403, "learning_rate": 3.817778917253314e-06, "loss": 0.91423446, "num_input_tokens_seen": 7842115, "step": 376, "time_per_iteration": 2.621629476547241 }, { "auxiliary_loss_clip": 0.01306872, "auxiliary_loss_mlp": 0.01087647, "balance_loss_clip": 1.07870364, "balance_loss_mlp": 1.04868913, "epoch": 0.02266646625582444, "flos": 16028189349120.0, "grad_norm": 3.0367767906095917, "language_loss": 0.75437558, "learning_rate": 3.8194890170196155e-06, "loss": 0.77832079, "num_input_tokens_seen": 7857830, "step": 377, "time_per_iteration": 2.5465245246887207 }, { "auxiliary_loss_clip": 0.01298987, "auxiliary_loss_mlp": 0.01093623, "balance_loss_clip": 1.08128345, "balance_loss_mlp": 1.0517087, "epoch": 0.02272658950849241, "flos": 20404622430720.0, "grad_norm": 2.1955644054597374, "language_loss": 0.99231368, "learning_rate": 3.8211945867150055e-06, "loss": 1.01623976, "num_input_tokens_seen": 7875840, "step": 378, "time_per_iteration": 7.184643983840942 }, { "auxiliary_loss_clip": 0.01202133, "auxiliary_loss_mlp": 0.01040839, "balance_loss_clip": 1.0828104, "balance_loss_mlp": 1.0283463, "epoch": 0.02278671276116038, "flos": 69847332647040.0, "grad_norm": 0.9608118941287621, "language_loss": 0.75395739, "learning_rate": 3.822895650276492e-06, "loss": 0.7763871, "num_input_tokens_seen": 7940190, "step": 379, "time_per_iteration": 4.961140394210815 }, { "auxiliary_loss_clip": 0.01308523, "auxiliary_loss_mlp": 0.01087195, "balance_loss_clip": 1.07820678, "balance_loss_mlp": 1.04792738, "epoch": 0.022846836013828347, "flos": 38508771340800.0, "grad_norm": 3.7276648293904375, "language_loss": 0.78197825, "learning_rate": 3.824592231451859e-06, "loss": 0.8059355, "num_input_tokens_seen": 7960840, "step": 380, "time_per_iteration": 2.7892863750457764 }, { "auxiliary_loss_clip": 0.01301718, "auxiliary_loss_mlp": 0.01088822, "balance_loss_clip": 1.07955217, "balance_loss_mlp": 1.04945946, "epoch": 0.02290695926649632, "flos": 20959478795520.0, "grad_norm": 2.0941800643649855, "language_loss": 0.96743369, "learning_rate": 3.826284353801652e-06, "loss": 0.99133915, "num_input_tokens_seen": 7975500, "step": 381, "time_per_iteration": 2.619854688644409 }, { "auxiliary_loss_clip": 0.01311313, "auxiliary_loss_mlp": 0.01093973, "balance_loss_clip": 1.08192921, "balance_loss_mlp": 1.0539186, "epoch": 0.022967082519164288, "flos": 24022407335040.0, "grad_norm": 2.122042453210184, "language_loss": 0.87664795, "learning_rate": 3.827972040701142e-06, "loss": 0.90070075, "num_input_tokens_seen": 7993880, "step": 382, "time_per_iteration": 2.617398500442505 }, { "auxiliary_loss_clip": 0.01304042, "auxiliary_loss_mlp": 0.01096828, "balance_loss_clip": 1.0821979, "balance_loss_mlp": 1.05760849, "epoch": 0.023027205771832256, "flos": 20997149184000.0, "grad_norm": 1.978420170714987, "language_loss": 0.84990942, "learning_rate": 3.829655315342268e-06, "loss": 0.87391812, "num_input_tokens_seen": 8012730, "step": 383, "time_per_iteration": 2.6345314979553223 }, { "auxiliary_loss_clip": 0.01300873, "auxiliary_loss_mlp": 0.0111136, "balance_loss_clip": 1.08199024, "balance_loss_mlp": 1.0716393, "epoch": 0.023087329024500225, "flos": 21360816432000.0, "grad_norm": 2.0575071112917778, "language_loss": 0.83349717, "learning_rate": 3.831334200735543e-06, "loss": 0.8576194, "num_input_tokens_seen": 8031275, "step": 384, "time_per_iteration": 2.6339902877807617 }, { "auxiliary_loss_clip": 0.0129979, "auxiliary_loss_mlp": 0.010893, "balance_loss_clip": 1.08362782, "balance_loss_mlp": 1.05255938, "epoch": 0.023147452277168194, "flos": 21872435800320.0, "grad_norm": 1.7828777740185773, "language_loss": 0.89289594, "learning_rate": 3.8330087197119426e-06, "loss": 0.91678685, "num_input_tokens_seen": 8051600, "step": 385, "time_per_iteration": 2.690460205078125 }, { "auxiliary_loss_clip": 0.01305297, "auxiliary_loss_mlp": 0.01118129, "balance_loss_clip": 1.08288455, "balance_loss_mlp": 1.07926655, "epoch": 0.023207575529836166, "flos": 18916700423040.0, "grad_norm": 1.9487706588237765, "language_loss": 0.70157433, "learning_rate": 3.83467889492477e-06, "loss": 0.72580856, "num_input_tokens_seen": 8070600, "step": 386, "time_per_iteration": 2.681957721710205 }, { "auxiliary_loss_clip": 0.01305989, "auxiliary_loss_mlp": 0.0109088, "balance_loss_clip": 1.08441973, "balance_loss_mlp": 1.05309081, "epoch": 0.023267698782504134, "flos": 25046005207680.0, "grad_norm": 2.354342660334866, "language_loss": 0.87840039, "learning_rate": 3.836344748851495e-06, "loss": 0.90236908, "num_input_tokens_seen": 8090680, "step": 387, "time_per_iteration": 2.6511123180389404 }, { "auxiliary_loss_clip": 0.01304298, "auxiliary_loss_mlp": 0.01075541, "balance_loss_clip": 1.08178413, "balance_loss_mlp": 1.03658366, "epoch": 0.023327822035172103, "flos": 28879217930880.0, "grad_norm": 2.2068948332198643, "language_loss": 0.8341614, "learning_rate": 3.838006303795566e-06, "loss": 0.85795981, "num_input_tokens_seen": 8114610, "step": 388, "time_per_iteration": 2.7062034606933594 }, { "auxiliary_loss_clip": 0.01301997, "auxiliary_loss_mlp": 0.01089724, "balance_loss_clip": 1.08110905, "balance_loss_mlp": 1.05284107, "epoch": 0.02338794528784007, "flos": 27121533805440.0, "grad_norm": 2.1887236217853863, "language_loss": 0.93710232, "learning_rate": 3.839663581888206e-06, "loss": 0.96101958, "num_input_tokens_seen": 8133975, "step": 389, "time_per_iteration": 2.680280923843384 }, { "auxiliary_loss_clip": 0.01296082, "auxiliary_loss_mlp": 0.01083127, "balance_loss_clip": 1.0818491, "balance_loss_mlp": 1.04397893, "epoch": 0.02344806854050804, "flos": 21322355944320.0, "grad_norm": 1.981860280002506, "language_loss": 0.87747037, "learning_rate": 3.841316605090178e-06, "loss": 0.9012624, "num_input_tokens_seen": 8153570, "step": 390, "time_per_iteration": 2.65970516204834 }, { "auxiliary_loss_clip": 0.01301203, "auxiliary_loss_mlp": 0.01092853, "balance_loss_clip": 1.08357048, "balance_loss_mlp": 1.0568521, "epoch": 0.023508191793176012, "flos": 24789997998720.0, "grad_norm": 2.134782100250632, "language_loss": 0.89370871, "learning_rate": 3.842965395193529e-06, "loss": 0.91764927, "num_input_tokens_seen": 8170075, "step": 391, "time_per_iteration": 2.620009660720825 }, { "auxiliary_loss_clip": 0.01296395, "auxiliary_loss_mlp": 0.01072264, "balance_loss_clip": 1.07956719, "balance_loss_mlp": 1.03521371, "epoch": 0.02356831504584398, "flos": 25995375624960.0, "grad_norm": 2.366558958564603, "language_loss": 0.86076117, "learning_rate": 3.84460997382332e-06, "loss": 0.88444775, "num_input_tokens_seen": 8190420, "step": 392, "time_per_iteration": 2.7171695232391357 }, { "auxiliary_loss_clip": 0.01293283, "auxiliary_loss_mlp": 0.01084283, "balance_loss_clip": 1.07891107, "balance_loss_mlp": 1.04763794, "epoch": 0.02362843829851195, "flos": 19062461813760.0, "grad_norm": 2.038818686720474, "language_loss": 0.89096916, "learning_rate": 3.8462503624393256e-06, "loss": 0.91474473, "num_input_tokens_seen": 8208790, "step": 393, "time_per_iteration": 2.632129669189453 }, { "auxiliary_loss_clip": 0.01304158, "auxiliary_loss_mlp": 0.01102255, "balance_loss_clip": 1.08471596, "balance_loss_mlp": 1.06279635, "epoch": 0.023688561551179918, "flos": 16071031296000.0, "grad_norm": 1.7920692319020195, "language_loss": 0.8156364, "learning_rate": 3.84788658233771e-06, "loss": 0.83970058, "num_input_tokens_seen": 8226885, "step": 394, "time_per_iteration": 2.5932936668395996 }, { "auxiliary_loss_clip": 0.01296851, "auxiliary_loss_mlp": 0.01088191, "balance_loss_clip": 1.07939875, "balance_loss_mlp": 1.04920936, "epoch": 0.023748684803847887, "flos": 21724375939200.0, "grad_norm": 4.539737106404062, "language_loss": 0.85808635, "learning_rate": 3.84951865465269e-06, "loss": 0.88193679, "num_input_tokens_seen": 8246825, "step": 395, "time_per_iteration": 2.6112868785858154 }, { "auxiliary_loss_clip": 0.01194704, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.07210529, "balance_loss_mlp": 1.02319229, "epoch": 0.02380880805651586, "flos": 61926192881280.0, "grad_norm": 0.9258089920958834, "language_loss": 0.6380353, "learning_rate": 3.851146600358172e-06, "loss": 0.66032922, "num_input_tokens_seen": 8302835, "step": 396, "time_per_iteration": 3.031489133834839 }, { "auxiliary_loss_clip": 0.0129188, "auxiliary_loss_mlp": 0.01071022, "balance_loss_clip": 1.07806754, "balance_loss_mlp": 1.03447223, "epoch": 0.023868931309183827, "flos": 20266331068800.0, "grad_norm": 2.3741099598177624, "language_loss": 0.83878696, "learning_rate": 3.852770440269372e-06, "loss": 0.86241591, "num_input_tokens_seen": 8320745, "step": 397, "time_per_iteration": 2.6049532890319824 }, { "auxiliary_loss_clip": 0.01297108, "auxiliary_loss_mlp": 0.01087341, "balance_loss_clip": 1.08104038, "balance_loss_mlp": 1.04890823, "epoch": 0.023929054561851796, "flos": 21139103733120.0, "grad_norm": 4.6847154905409205, "language_loss": 0.84066498, "learning_rate": 3.854390195044404e-06, "loss": 0.86450952, "num_input_tokens_seen": 8339540, "step": 398, "time_per_iteration": 2.6516692638397217 }, { "auxiliary_loss_clip": 0.01295876, "auxiliary_loss_mlp": 0.01078722, "balance_loss_clip": 1.07671928, "balance_loss_mlp": 1.04007471, "epoch": 0.023989177814519765, "flos": 13698521049600.0, "grad_norm": 2.80358563189936, "language_loss": 0.86029691, "learning_rate": 3.856005885185868e-06, "loss": 0.88404286, "num_input_tokens_seen": 8354890, "step": 399, "time_per_iteration": 2.5452589988708496 }, { "auxiliary_loss_clip": 0.01292698, "auxiliary_loss_mlp": 0.01090822, "balance_loss_clip": 1.08074594, "balance_loss_mlp": 1.05308056, "epoch": 0.024049301067187733, "flos": 26322018929280.0, "grad_norm": 2.021318687641168, "language_loss": 0.86254489, "learning_rate": 3.857617531042398e-06, "loss": 0.88638014, "num_input_tokens_seen": 8375845, "step": 400, "time_per_iteration": 2.6626927852630615 }, { "auxiliary_loss_clip": 0.01299822, "auxiliary_loss_mlp": 0.01083301, "balance_loss_clip": 1.08346462, "balance_loss_mlp": 1.04687035, "epoch": 0.024109424319855705, "flos": 24425432910720.0, "grad_norm": 1.735822397657743, "language_loss": 0.79276752, "learning_rate": 3.8592251528102065e-06, "loss": 0.81659877, "num_input_tokens_seen": 8395240, "step": 401, "time_per_iteration": 2.68418025970459 }, { "auxiliary_loss_clip": 0.0129275, "auxiliary_loss_mlp": 0.01091389, "balance_loss_clip": 1.07852793, "balance_loss_mlp": 1.05493474, "epoch": 0.024169547572523674, "flos": 29604397610880.0, "grad_norm": 3.889755427752258, "language_loss": 0.78890866, "learning_rate": 3.8608287705345976e-06, "loss": 0.81274998, "num_input_tokens_seen": 8416950, "step": 402, "time_per_iteration": 2.7509379386901855 }, { "auxiliary_loss_clip": 0.01296434, "auxiliary_loss_mlp": 0.01082712, "balance_loss_clip": 1.07797897, "balance_loss_mlp": 1.04399323, "epoch": 0.024229670825191642, "flos": 22601458235520.0, "grad_norm": 2.49356632429363, "language_loss": 0.94936156, "learning_rate": 3.86242840411147e-06, "loss": 0.97315305, "num_input_tokens_seen": 8433660, "step": 403, "time_per_iteration": 2.5760560035705566 }, { "auxiliary_loss_clip": 0.0129994, "auxiliary_loss_mlp": 0.01091893, "balance_loss_clip": 1.07754242, "balance_loss_mlp": 1.05315053, "epoch": 0.02428979407785961, "flos": 18150258994560.0, "grad_norm": 2.361656575803209, "language_loss": 0.99877387, "learning_rate": 3.864024073288798e-06, "loss": 1.0226922, "num_input_tokens_seen": 8450180, "step": 404, "time_per_iteration": 2.5966458320617676 }, { "auxiliary_loss_clip": 0.01298911, "auxiliary_loss_mlp": 0.01100127, "balance_loss_clip": 1.08096266, "balance_loss_mlp": 1.06312442, "epoch": 0.024349917330527583, "flos": 15304984917120.0, "grad_norm": 2.3162348618509276, "language_loss": 0.8802169, "learning_rate": 3.865615797668091e-06, "loss": 0.90420723, "num_input_tokens_seen": 8467775, "step": 405, "time_per_iteration": 2.5728275775909424 }, { "auxiliary_loss_clip": 0.01306827, "auxiliary_loss_mlp": 0.01097881, "balance_loss_clip": 1.084512, "balance_loss_mlp": 1.06004393, "epoch": 0.024410040583195552, "flos": 20773892200320.0, "grad_norm": 2.7399607903318275, "language_loss": 0.93386561, "learning_rate": 3.867203596705844e-06, "loss": 0.95791268, "num_input_tokens_seen": 8486765, "step": 406, "time_per_iteration": 2.612668991088867 }, { "auxiliary_loss_clip": 0.01299426, "auxiliary_loss_mlp": 0.01088378, "balance_loss_clip": 1.08213782, "balance_loss_mlp": 1.0500164, "epoch": 0.02447016383586352, "flos": 21798854789760.0, "grad_norm": 2.1742012769968526, "language_loss": 0.87128031, "learning_rate": 3.86878748971496e-06, "loss": 0.89515841, "num_input_tokens_seen": 8506515, "step": 407, "time_per_iteration": 2.5982017517089844 }, { "auxiliary_loss_clip": 0.01298266, "auxiliary_loss_mlp": 0.01083858, "balance_loss_clip": 1.08472157, "balance_loss_mlp": 1.04630709, "epoch": 0.02453028708853149, "flos": 33948116380800.0, "grad_norm": 2.1458430439144234, "language_loss": 0.74102569, "learning_rate": 3.8703674958661596e-06, "loss": 0.76484692, "num_input_tokens_seen": 8528035, "step": 408, "time_per_iteration": 2.708670139312744 }, { "auxiliary_loss_clip": 0.01300128, "auxiliary_loss_mlp": 0.01089985, "balance_loss_clip": 1.08222318, "balance_loss_mlp": 1.05233896, "epoch": 0.024590410341199458, "flos": 21793000872960.0, "grad_norm": 2.4878473813549675, "language_loss": 0.92509401, "learning_rate": 3.871943634189376e-06, "loss": 0.94899511, "num_input_tokens_seen": 8546455, "step": 409, "time_per_iteration": 2.665321111679077 }, { "auxiliary_loss_clip": 0.01296394, "auxiliary_loss_mlp": 0.01077538, "balance_loss_clip": 1.08126342, "balance_loss_mlp": 1.04291987, "epoch": 0.02465053359386743, "flos": 35114782124160.0, "grad_norm": 2.2521095969191722, "language_loss": 0.82792604, "learning_rate": 3.873515923575128e-06, "loss": 0.85166532, "num_input_tokens_seen": 8568450, "step": 410, "time_per_iteration": 2.848928213119507 }, { "auxiliary_loss_clip": 0.01299459, "auxiliary_loss_mlp": 0.01089133, "balance_loss_clip": 1.08187068, "balance_loss_mlp": 1.05284572, "epoch": 0.0247106568465354, "flos": 27451409333760.0, "grad_norm": 2.1393760271628595, "language_loss": 0.77577484, "learning_rate": 3.875084382775879e-06, "loss": 0.79966074, "num_input_tokens_seen": 8589340, "step": 411, "time_per_iteration": 2.6645278930664062 }, { "auxiliary_loss_clip": 0.01298341, "auxiliary_loss_mlp": 0.0110154, "balance_loss_clip": 1.07977521, "balance_loss_mlp": 1.06289268, "epoch": 0.024770780099203367, "flos": 20703794808960.0, "grad_norm": 2.2974658872162665, "language_loss": 0.86379063, "learning_rate": 3.87664903040738e-06, "loss": 0.88778943, "num_input_tokens_seen": 8607150, "step": 412, "time_per_iteration": 2.6091151237487793 }, { "auxiliary_loss_clip": 0.01187014, "auxiliary_loss_mlp": 0.01031436, "balance_loss_clip": 1.07387948, "balance_loss_mlp": 1.02089787, "epoch": 0.024830903351871336, "flos": 69551859369600.0, "grad_norm": 0.8687159185244209, "language_loss": 0.5852263, "learning_rate": 3.878209884949994e-06, "loss": 0.60741079, "num_input_tokens_seen": 8669865, "step": 413, "time_per_iteration": 3.2269625663757324 }, { "auxiliary_loss_clip": 0.0129043, "auxiliary_loss_mlp": 0.01091958, "balance_loss_clip": 1.07709181, "balance_loss_mlp": 1.05249953, "epoch": 0.024891026604539304, "flos": 32270477713920.0, "grad_norm": 1.8280666153990437, "language_loss": 0.80517173, "learning_rate": 3.879766964750006e-06, "loss": 0.82899559, "num_input_tokens_seen": 8690235, "step": 414, "time_per_iteration": 2.720341444015503 }, { "auxiliary_loss_clip": 0.01287097, "auxiliary_loss_mlp": 0.0109242, "balance_loss_clip": 1.0756042, "balance_loss_mlp": 1.0556556, "epoch": 0.024951149857207276, "flos": 18840282238080.0, "grad_norm": 2.1921003994701302, "language_loss": 0.80227423, "learning_rate": 3.881320288020917e-06, "loss": 0.82606936, "num_input_tokens_seen": 8706295, "step": 415, "time_per_iteration": 2.6473400592803955 }, { "auxiliary_loss_clip": 0.01302694, "auxiliary_loss_mlp": 0.01082455, "balance_loss_clip": 1.08156919, "balance_loss_mlp": 1.04497528, "epoch": 0.025011273109875245, "flos": 15377201210880.0, "grad_norm": 2.9318871737289776, "language_loss": 0.96236515, "learning_rate": 3.882869872844723e-06, "loss": 0.9862166, "num_input_tokens_seen": 8724200, "step": 416, "time_per_iteration": 2.596189260482788 }, { "auxiliary_loss_clip": 0.01291636, "auxiliary_loss_mlp": 0.01074465, "balance_loss_clip": 1.07628798, "balance_loss_mlp": 1.0355792, "epoch": 0.025071396362543213, "flos": 18915515274240.0, "grad_norm": 1.741746736079687, "language_loss": 0.77381694, "learning_rate": 3.884415737173176e-06, "loss": 0.79747796, "num_input_tokens_seen": 8744170, "step": 417, "time_per_iteration": 5.610344171524048 }, { "auxiliary_loss_clip": 0.01290746, "auxiliary_loss_mlp": 0.0109022, "balance_loss_clip": 1.08072221, "balance_loss_mlp": 1.05264485, "epoch": 0.025131519615211182, "flos": 25337958952320.0, "grad_norm": 1.554385639735456, "language_loss": 0.77076226, "learning_rate": 3.8859578988290344e-06, "loss": 0.79457194, "num_input_tokens_seen": 8765120, "step": 418, "time_per_iteration": 5.837290525436401 }, { "auxiliary_loss_clip": 0.01297026, "auxiliary_loss_mlp": 0.01071197, "balance_loss_clip": 1.08019948, "balance_loss_mlp": 1.03550553, "epoch": 0.02519164286787915, "flos": 18953149749120.0, "grad_norm": 2.4603268634516207, "language_loss": 0.81445098, "learning_rate": 3.887496375507294e-06, "loss": 0.83813322, "num_input_tokens_seen": 8783500, "step": 419, "time_per_iteration": 2.582590341567993 }, { "auxiliary_loss_clip": 0.01291114, "auxiliary_loss_mlp": 0.01086736, "balance_loss_clip": 1.07929599, "balance_loss_mlp": 1.04708743, "epoch": 0.025251766120547123, "flos": 17421092904960.0, "grad_norm": 1.8078532084212713, "language_loss": 0.73618573, "learning_rate": 3.8890311847764065e-06, "loss": 0.75996423, "num_input_tokens_seen": 8801175, "step": 420, "time_per_iteration": 2.6739418506622314 }, { "auxiliary_loss_clip": 0.01290485, "auxiliary_loss_mlp": 0.01096292, "balance_loss_clip": 1.07605243, "balance_loss_mlp": 1.05924153, "epoch": 0.02531188937321509, "flos": 25045430590080.0, "grad_norm": 1.77336014903074, "language_loss": 0.79040134, "learning_rate": 3.890562344079484e-06, "loss": 0.81426907, "num_input_tokens_seen": 8820215, "step": 421, "time_per_iteration": 2.6928632259368896 }, { "auxiliary_loss_clip": 0.01290689, "auxiliary_loss_mlp": 0.01088863, "balance_loss_clip": 1.07922924, "balance_loss_mlp": 1.04983425, "epoch": 0.02537201262588306, "flos": 30592228515840.0, "grad_norm": 2.2139016136437104, "language_loss": 0.8203755, "learning_rate": 3.89208987073549e-06, "loss": 0.84417105, "num_input_tokens_seen": 8839660, "step": 422, "time_per_iteration": 2.714707851409912 }, { "auxiliary_loss_clip": 0.01293659, "auxiliary_loss_mlp": 0.01078975, "balance_loss_clip": 1.07677865, "balance_loss_mlp": 1.04430926, "epoch": 0.02543213587855103, "flos": 26065365275520.0, "grad_norm": 2.1259138778576356, "language_loss": 0.83458018, "learning_rate": 3.893613781940409e-06, "loss": 0.85830647, "num_input_tokens_seen": 8859280, "step": 423, "time_per_iteration": 2.652757167816162 }, { "auxiliary_loss_clip": 0.01287497, "auxiliary_loss_mlp": 0.01078335, "balance_loss_clip": 1.0742569, "balance_loss_mlp": 1.04221487, "epoch": 0.025492259131218997, "flos": 36022818965760.0, "grad_norm": 2.012741083661608, "language_loss": 0.74129444, "learning_rate": 3.895134094768415e-06, "loss": 0.76495278, "num_input_tokens_seen": 8880560, "step": 424, "time_per_iteration": 2.7724521160125732 }, { "auxiliary_loss_clip": 0.01296446, "auxiliary_loss_mlp": 0.01093799, "balance_loss_clip": 1.07987142, "balance_loss_mlp": 1.05782199, "epoch": 0.02555238238388697, "flos": 18588045957120.0, "grad_norm": 4.623670538116741, "language_loss": 0.83193713, "learning_rate": 3.896650826173015e-06, "loss": 0.85583955, "num_input_tokens_seen": 8899155, "step": 425, "time_per_iteration": 2.608029842376709 }, { "auxiliary_loss_clip": 0.01292462, "auxiliary_loss_mlp": 0.01092376, "balance_loss_clip": 1.07259536, "balance_loss_mlp": 1.0544672, "epoch": 0.025612505636554938, "flos": 24243186280320.0, "grad_norm": 2.5075767706443566, "language_loss": 0.853073, "learning_rate": 3.898163992988186e-06, "loss": 0.87692136, "num_input_tokens_seen": 8917890, "step": 426, "time_per_iteration": 2.6445271968841553 }, { "auxiliary_loss_clip": 0.01175923, "auxiliary_loss_mlp": 0.01017688, "balance_loss_clip": 1.06532824, "balance_loss_mlp": 1.00781715, "epoch": 0.025672628889222907, "flos": 60586941265920.0, "grad_norm": 0.8949637292547264, "language_loss": 0.57219732, "learning_rate": 3.899673611929491e-06, "loss": 0.5941335, "num_input_tokens_seen": 8978260, "step": 427, "time_per_iteration": 3.2690517902374268 }, { "auxiliary_loss_clip": 0.01291989, "auxiliary_loss_mlp": 0.01092649, "balance_loss_clip": 1.08155811, "balance_loss_mlp": 1.05674267, "epoch": 0.025732752141890875, "flos": 19573255169280.0, "grad_norm": 2.4869215225306673, "language_loss": 0.88130605, "learning_rate": 3.901179699595194e-06, "loss": 0.90515244, "num_input_tokens_seen": 8994460, "step": 428, "time_per_iteration": 2.6143813133239746 }, { "auxiliary_loss_clip": 0.01283603, "auxiliary_loss_mlp": 0.0107531, "balance_loss_clip": 1.07418942, "balance_loss_mlp": 1.03735399, "epoch": 0.025792875394558847, "flos": 31284262920960.0, "grad_norm": 2.067247304638145, "language_loss": 0.85790849, "learning_rate": 3.902682272467353e-06, "loss": 0.88149762, "num_input_tokens_seen": 9016670, "step": 429, "time_per_iteration": 2.749328374862671 }, { "auxiliary_loss_clip": 0.01288943, "auxiliary_loss_mlp": 0.01083888, "balance_loss_clip": 1.07337689, "balance_loss_mlp": 1.04590786, "epoch": 0.025852998647226816, "flos": 32379610210560.0, "grad_norm": 2.4411876712444034, "language_loss": 0.8815223, "learning_rate": 3.904181346912895e-06, "loss": 0.90525061, "num_input_tokens_seen": 9039720, "step": 430, "time_per_iteration": 2.7483572959899902 }, { "auxiliary_loss_clip": 0.01290726, "auxiliary_loss_mlp": 0.01080495, "balance_loss_clip": 1.0803287, "balance_loss_mlp": 1.04573333, "epoch": 0.025913121899894784, "flos": 20193288762240.0, "grad_norm": 2.086180078538185, "language_loss": 0.84249514, "learning_rate": 3.905676939184698e-06, "loss": 0.8662073, "num_input_tokens_seen": 9059850, "step": 431, "time_per_iteration": 2.6531126499176025 }, { "auxiliary_loss_clip": 0.01286945, "auxiliary_loss_mlp": 0.01073345, "balance_loss_clip": 1.07570636, "balance_loss_mlp": 1.03951311, "epoch": 0.025973245152562753, "flos": 14720430983040.0, "grad_norm": 2.681931959502968, "language_loss": 0.86511916, "learning_rate": 3.907169065422638e-06, "loss": 0.88872206, "num_input_tokens_seen": 9077590, "step": 432, "time_per_iteration": 2.7582762241363525 }, { "auxiliary_loss_clip": 0.01287429, "auxiliary_loss_mlp": 0.01072961, "balance_loss_clip": 1.07632601, "balance_loss_mlp": 1.03891492, "epoch": 0.02603336840523072, "flos": 30992991534720.0, "grad_norm": 1.95596969308187, "language_loss": 0.76036298, "learning_rate": 3.908657741654636e-06, "loss": 0.7839669, "num_input_tokens_seen": 9099880, "step": 433, "time_per_iteration": 2.707771062850952 }, { "auxiliary_loss_clip": 0.01289436, "auxiliary_loss_mlp": 0.01088504, "balance_loss_clip": 1.07470191, "balance_loss_mlp": 1.04973757, "epoch": 0.026093491657898694, "flos": 17674262939520.0, "grad_norm": 2.157056093147959, "language_loss": 0.8979522, "learning_rate": 3.910142983797699e-06, "loss": 0.92173159, "num_input_tokens_seen": 9118620, "step": 434, "time_per_iteration": 2.5665409564971924 }, { "auxiliary_loss_clip": 0.01289617, "auxiliary_loss_mlp": 0.01096405, "balance_loss_clip": 1.07960439, "balance_loss_mlp": 1.05904448, "epoch": 0.026153614910566662, "flos": 17857874286720.0, "grad_norm": 2.306071945033866, "language_loss": 0.80187833, "learning_rate": 3.9116248076589305e-06, "loss": 0.82573849, "num_input_tokens_seen": 9135655, "step": 435, "time_per_iteration": 2.614440679550171 }, { "auxiliary_loss_clip": 0.01285396, "auxiliary_loss_mlp": 0.01092207, "balance_loss_clip": 1.07367229, "balance_loss_mlp": 1.05503798, "epoch": 0.02621373816323463, "flos": 20011113959040.0, "grad_norm": 3.0257040949539356, "language_loss": 0.86361396, "learning_rate": 3.913103228936546e-06, "loss": 0.88739002, "num_input_tokens_seen": 9153520, "step": 436, "time_per_iteration": 2.635033130645752 }, { "auxiliary_loss_clip": 0.01289558, "auxiliary_loss_mlp": 0.01096903, "balance_loss_clip": 1.07716811, "balance_loss_mlp": 1.06080687, "epoch": 0.0262738614159026, "flos": 19281193683840.0, "grad_norm": 2.4233286399217993, "language_loss": 0.74725163, "learning_rate": 3.914578263220868e-06, "loss": 0.77111626, "num_input_tokens_seen": 9170750, "step": 437, "time_per_iteration": 2.6614880561828613 }, { "auxiliary_loss_clip": 0.01286403, "auxiliary_loss_mlp": 0.01100399, "balance_loss_clip": 1.07628679, "balance_loss_mlp": 1.06220388, "epoch": 0.026333984668570568, "flos": 18807208790400.0, "grad_norm": 2.79370908187484, "language_loss": 0.9131338, "learning_rate": 3.916049925995316e-06, "loss": 0.93700182, "num_input_tokens_seen": 9188430, "step": 438, "time_per_iteration": 2.674877166748047 }, { "auxiliary_loss_clip": 0.01169678, "auxiliary_loss_mlp": 0.01072518, "balance_loss_clip": 1.0602653, "balance_loss_mlp": 1.06250465, "epoch": 0.02639410792123854, "flos": 64572020691840.0, "grad_norm": 0.8871275810137318, "language_loss": 0.62631273, "learning_rate": 3.917518232637377e-06, "loss": 0.64873469, "num_input_tokens_seen": 9255835, "step": 439, "time_per_iteration": 3.2527849674224854 }, { "auxiliary_loss_clip": 0.01296492, "auxiliary_loss_mlp": 0.01095184, "balance_loss_clip": 1.08175814, "balance_loss_mlp": 1.05758572, "epoch": 0.02645423117390651, "flos": 28473462921600.0, "grad_norm": 3.31985956061953, "language_loss": 0.75982475, "learning_rate": 3.918983198419573e-06, "loss": 0.78374153, "num_input_tokens_seen": 9276835, "step": 440, "time_per_iteration": 2.6770262718200684 }, { "auxiliary_loss_clip": 0.01286342, "auxiliary_loss_mlp": 0.01076505, "balance_loss_clip": 1.07652593, "balance_loss_mlp": 1.04048026, "epoch": 0.026514354426574478, "flos": 18551237495040.0, "grad_norm": 3.0236705091068283, "language_loss": 0.83197021, "learning_rate": 3.920444838510415e-06, "loss": 0.85559869, "num_input_tokens_seen": 9295075, "step": 441, "time_per_iteration": 2.591306209564209 }, { "auxiliary_loss_clip": 0.01291817, "auxiliary_loss_mlp": 0.01086154, "balance_loss_clip": 1.07703269, "balance_loss_mlp": 1.04829359, "epoch": 0.026574477679242446, "flos": 20667812359680.0, "grad_norm": 2.202684635319811, "language_loss": 0.78490162, "learning_rate": 3.92190316797534e-06, "loss": 0.80868137, "num_input_tokens_seen": 9314205, "step": 442, "time_per_iteration": 2.633054733276367 }, { "auxiliary_loss_clip": 0.0116251, "auxiliary_loss_mlp": 0.01015158, "balance_loss_clip": 1.05336332, "balance_loss_mlp": 1.0054301, "epoch": 0.026634600931910415, "flos": 57956125340160.0, "grad_norm": 0.9609264438471399, "language_loss": 0.64459753, "learning_rate": 3.92335820177765e-06, "loss": 0.66637421, "num_input_tokens_seen": 9367395, "step": 443, "time_per_iteration": 3.1241400241851807 }, { "auxiliary_loss_clip": 0.01291897, "auxiliary_loss_mlp": 0.01085882, "balance_loss_clip": 1.08147204, "balance_loss_mlp": 1.04906964, "epoch": 0.026694724184578387, "flos": 15815131827840.0, "grad_norm": 2.121488874389134, "language_loss": 0.82093638, "learning_rate": 3.924809954779425e-06, "loss": 0.84471416, "num_input_tokens_seen": 9385185, "step": 444, "time_per_iteration": 2.6202428340911865 }, { "auxiliary_loss_clip": 0.0129406, "auxiliary_loss_mlp": 0.01082041, "balance_loss_clip": 1.07940578, "balance_loss_mlp": 1.04263067, "epoch": 0.026754847437246355, "flos": 23440259612160.0, "grad_norm": 2.2213674770888607, "language_loss": 0.95689106, "learning_rate": 3.9262584417424425e-06, "loss": 0.98065209, "num_input_tokens_seen": 9403225, "step": 445, "time_per_iteration": 2.6071228981018066 }, { "auxiliary_loss_clip": 0.01289866, "auxiliary_loss_mlp": 0.01094053, "balance_loss_clip": 1.07953668, "balance_loss_mlp": 1.05492878, "epoch": 0.026814970689914324, "flos": 17341801632000.0, "grad_norm": 2.775359545549618, "language_loss": 0.91932094, "learning_rate": 3.9277036773290725e-06, "loss": 0.94316012, "num_input_tokens_seen": 9420540, "step": 446, "time_per_iteration": 2.5791916847229004 }, { "auxiliary_loss_clip": 0.01289847, "auxiliary_loss_mlp": 0.01088114, "balance_loss_clip": 1.08072042, "balance_loss_mlp": 1.05092025, "epoch": 0.026875093942582293, "flos": 17894718662400.0, "grad_norm": 2.0562763127679204, "language_loss": 0.79831308, "learning_rate": 3.92914567610317e-06, "loss": 0.82209271, "num_input_tokens_seen": 9438840, "step": 447, "time_per_iteration": 2.6420843601226807 }, { "auxiliary_loss_clip": 0.01289397, "auxiliary_loss_mlp": 0.01079607, "balance_loss_clip": 1.07901013, "balance_loss_mlp": 1.04446411, "epoch": 0.026935217195250265, "flos": 21723980889600.0, "grad_norm": 2.231264914467203, "language_loss": 0.86402845, "learning_rate": 3.930584452530952e-06, "loss": 0.8877185, "num_input_tokens_seen": 9457215, "step": 448, "time_per_iteration": 2.590277910232544 }, { "auxiliary_loss_clip": 0.01282455, "auxiliary_loss_mlp": 0.01091099, "balance_loss_clip": 1.07706833, "balance_loss_mlp": 1.05662322, "epoch": 0.026995340447918233, "flos": 23622685810560.0, "grad_norm": 1.941778256808524, "language_loss": 0.88581634, "learning_rate": 3.9320200209818755e-06, "loss": 0.90955186, "num_input_tokens_seen": 9475615, "step": 449, "time_per_iteration": 2.610065460205078 }, { "auxiliary_loss_clip": 0.01293472, "auxiliary_loss_mlp": 0.01085576, "balance_loss_clip": 1.07856452, "balance_loss_mlp": 1.04814398, "epoch": 0.027055463700586202, "flos": 17931275729280.0, "grad_norm": 2.199007921978797, "language_loss": 0.80395782, "learning_rate": 3.933452395729493e-06, "loss": 0.8277483, "num_input_tokens_seen": 9493975, "step": 450, "time_per_iteration": 2.637465238571167 }, { "auxiliary_loss_clip": 0.01284612, "auxiliary_loss_mlp": 0.0108001, "balance_loss_clip": 1.08025336, "balance_loss_mlp": 1.04384232, "epoch": 0.02711558695325417, "flos": 25118903859840.0, "grad_norm": 1.599374223212879, "language_loss": 0.81562543, "learning_rate": 3.934881590952304e-06, "loss": 0.83927161, "num_input_tokens_seen": 9514810, "step": 451, "time_per_iteration": 2.6506927013397217 }, { "auxiliary_loss_clip": 0.0128567, "auxiliary_loss_mlp": 0.01090719, "balance_loss_clip": 1.08126068, "balance_loss_mlp": 1.0533824, "epoch": 0.02717571020592214, "flos": 24239559006720.0, "grad_norm": 1.9677929562692107, "language_loss": 0.77019048, "learning_rate": 3.936307620734599e-06, "loss": 0.79395437, "num_input_tokens_seen": 9533635, "step": 452, "time_per_iteration": 2.5751442909240723 }, { "auxiliary_loss_clip": 0.01286865, "auxiliary_loss_mlp": 0.01088287, "balance_loss_clip": 1.08011293, "balance_loss_mlp": 1.05135596, "epoch": 0.02723583345859011, "flos": 25118939773440.0, "grad_norm": 1.7205362750177517, "language_loss": 0.72874546, "learning_rate": 3.937730499067294e-06, "loss": 0.75249696, "num_input_tokens_seen": 9555420, "step": 453, "time_per_iteration": 2.668083667755127 }, { "auxiliary_loss_clip": 0.01281405, "auxiliary_loss_mlp": 0.01083223, "balance_loss_clip": 1.07714963, "balance_loss_mlp": 1.04748416, "epoch": 0.02729595671125808, "flos": 42741597847680.0, "grad_norm": 1.8353680194819204, "language_loss": 0.82419729, "learning_rate": 3.939150239848748e-06, "loss": 0.84784359, "num_input_tokens_seen": 9578950, "step": 454, "time_per_iteration": 2.8580126762390137 }, { "auxiliary_loss_clip": 0.01285525, "auxiliary_loss_mlp": 0.01077241, "balance_loss_clip": 1.07935429, "balance_loss_mlp": 1.043648, "epoch": 0.02735607996392605, "flos": 21430985650560.0, "grad_norm": 1.985829769195046, "language_loss": 0.75404847, "learning_rate": 3.9405668568855866e-06, "loss": 0.77767611, "num_input_tokens_seen": 9598160, "step": 455, "time_per_iteration": 2.6593477725982666 }, { "auxiliary_loss_clip": 0.01282853, "auxiliary_loss_mlp": 0.01094959, "balance_loss_clip": 1.07477236, "balance_loss_mlp": 1.0597918, "epoch": 0.027416203216594017, "flos": 20851280052480.0, "grad_norm": 1.92483069519606, "language_loss": 0.80670613, "learning_rate": 3.941980363893499e-06, "loss": 0.83048427, "num_input_tokens_seen": 9616010, "step": 456, "time_per_iteration": 2.6798384189605713 }, { "auxiliary_loss_clip": 0.01280135, "auxiliary_loss_mlp": 0.01080319, "balance_loss_clip": 1.07714963, "balance_loss_mlp": 1.0435549, "epoch": 0.027476326469261986, "flos": 13224500242560.0, "grad_norm": 2.171481572134165, "language_loss": 0.81587321, "learning_rate": 3.9433907744980384e-06, "loss": 0.83947778, "num_input_tokens_seen": 9634000, "step": 457, "time_per_iteration": 5.62308406829834 }, { "auxiliary_loss_clip": 0.01283922, "auxiliary_loss_mlp": 0.01084055, "balance_loss_clip": 1.07603848, "balance_loss_mlp": 1.04891229, "epoch": 0.027536449721929958, "flos": 24024526237440.0, "grad_norm": 2.024184269172234, "language_loss": 0.94030929, "learning_rate": 3.944798102235412e-06, "loss": 0.96398914, "num_input_tokens_seen": 9653455, "step": 458, "time_per_iteration": 5.694372653961182 }, { "auxiliary_loss_clip": 0.01280807, "auxiliary_loss_mlp": 0.01091426, "balance_loss_clip": 1.07479525, "balance_loss_mlp": 1.05666471, "epoch": 0.027596572974597926, "flos": 13006055681280.0, "grad_norm": 2.356061876390436, "language_loss": 0.79279089, "learning_rate": 3.9462023605532545e-06, "loss": 0.81651318, "num_input_tokens_seen": 9669650, "step": 459, "time_per_iteration": 2.626948595046997 }, { "auxiliary_loss_clip": 0.01286253, "auxiliary_loss_mlp": 0.01081623, "balance_loss_clip": 1.08119941, "balance_loss_mlp": 1.04278445, "epoch": 0.027656696227265895, "flos": 26143076350080.0, "grad_norm": 2.0583603779546404, "language_loss": 0.83362132, "learning_rate": 3.947603562811407e-06, "loss": 0.85730016, "num_input_tokens_seen": 9691415, "step": 460, "time_per_iteration": 2.7191598415374756 }, { "auxiliary_loss_clip": 0.01158037, "auxiliary_loss_mlp": 0.01054463, "balance_loss_clip": 1.05032754, "balance_loss_mlp": 1.044402, "epoch": 0.027716819479933864, "flos": 60697222997760.0, "grad_norm": 1.612511499168885, "language_loss": 0.7351321, "learning_rate": 3.949001722282675e-06, "loss": 0.7572571, "num_input_tokens_seen": 9755605, "step": 461, "time_per_iteration": 3.210820436477661 }, { "auxiliary_loss_clip": 0.01284234, "auxiliary_loss_mlp": 0.01079832, "balance_loss_clip": 1.08432341, "balance_loss_mlp": 1.04700136, "epoch": 0.027776942732601832, "flos": 31211938886400.0, "grad_norm": 2.4500038571081073, "language_loss": 0.81596625, "learning_rate": 3.950396852153582e-06, "loss": 0.839607, "num_input_tokens_seen": 9776270, "step": 462, "time_per_iteration": 2.683197021484375 }, { "auxiliary_loss_clip": 0.01280414, "auxiliary_loss_mlp": 0.0107864, "balance_loss_clip": 1.07752454, "balance_loss_mlp": 1.0454762, "epoch": 0.027837065985269804, "flos": 22674644196480.0, "grad_norm": 2.258526594266715, "language_loss": 0.90062451, "learning_rate": 3.951788965525118e-06, "loss": 0.92421508, "num_input_tokens_seen": 9794465, "step": 463, "time_per_iteration": 2.641674757003784 }, { "auxiliary_loss_clip": 0.01151842, "auxiliary_loss_mlp": 0.01010002, "balance_loss_clip": 1.04755902, "balance_loss_mlp": 1.00027454, "epoch": 0.027897189237937773, "flos": 62182487399040.0, "grad_norm": 0.8962796480673014, "language_loss": 0.59058654, "learning_rate": 3.953178075413476e-06, "loss": 0.61220491, "num_input_tokens_seen": 9849685, "step": 464, "time_per_iteration": 3.1129612922668457 }, { "auxiliary_loss_clip": 0.01292933, "auxiliary_loss_mlp": 0.01100533, "balance_loss_clip": 1.08296049, "balance_loss_mlp": 1.06412649, "epoch": 0.02795731249060574, "flos": 24493160004480.0, "grad_norm": 2.3712654859298055, "language_loss": 0.81454253, "learning_rate": 3.954564194750784e-06, "loss": 0.83847719, "num_input_tokens_seen": 9869505, "step": 465, "time_per_iteration": 2.723144769668579 }, { "auxiliary_loss_clip": 0.01279938, "auxiliary_loss_mlp": 0.01092668, "balance_loss_clip": 1.07546401, "balance_loss_mlp": 1.05630863, "epoch": 0.02801743574327371, "flos": 23733003456000.0, "grad_norm": 1.9968224423519798, "language_loss": 0.78396618, "learning_rate": 3.955947336385828e-06, "loss": 0.80769229, "num_input_tokens_seen": 9890950, "step": 466, "time_per_iteration": 2.6278555393218994 }, { "auxiliary_loss_clip": 0.0127853, "auxiliary_loss_mlp": 0.01091802, "balance_loss_clip": 1.07703936, "balance_loss_mlp": 1.05661178, "epoch": 0.02807755899594168, "flos": 20629100476800.0, "grad_norm": 2.010021605622182, "language_loss": 0.87699366, "learning_rate": 3.957327513084761e-06, "loss": 0.90069699, "num_input_tokens_seen": 9911265, "step": 467, "time_per_iteration": 2.6687490940093994 }, { "auxiliary_loss_clip": 0.01285129, "auxiliary_loss_mlp": 0.01112935, "balance_loss_clip": 1.07874036, "balance_loss_mlp": 1.07576585, "epoch": 0.02813768224860965, "flos": 19244564789760.0, "grad_norm": 2.2302958424490416, "language_loss": 0.86091757, "learning_rate": 3.958704737531818e-06, "loss": 0.88489819, "num_input_tokens_seen": 9929025, "step": 468, "time_per_iteration": 2.5745644569396973 }, { "auxiliary_loss_clip": 0.01281128, "auxiliary_loss_mlp": 0.01085455, "balance_loss_clip": 1.07529211, "balance_loss_mlp": 1.04857147, "epoch": 0.02819780550127762, "flos": 20813968800000.0, "grad_norm": 2.1866562002509875, "language_loss": 0.91690558, "learning_rate": 3.9600790223300065e-06, "loss": 0.94057143, "num_input_tokens_seen": 9945190, "step": 469, "time_per_iteration": 2.610821008682251 }, { "auxiliary_loss_clip": 0.0127909, "auxiliary_loss_mlp": 0.0110095, "balance_loss_clip": 1.07675052, "balance_loss_mlp": 1.06482995, "epoch": 0.028257928753945588, "flos": 19974125928960.0, "grad_norm": 2.674428223667968, "language_loss": 0.81758964, "learning_rate": 3.96145038000181e-06, "loss": 0.84139001, "num_input_tokens_seen": 9962820, "step": 470, "time_per_iteration": 2.6004326343536377 }, { "auxiliary_loss_clip": 0.0128074, "auxiliary_loss_mlp": 0.01086643, "balance_loss_clip": 1.07482624, "balance_loss_mlp": 1.04947352, "epoch": 0.028318052006613557, "flos": 20484488321280.0, "grad_norm": 1.788793606991614, "language_loss": 0.93071401, "learning_rate": 3.962818822989861e-06, "loss": 0.95438784, "num_input_tokens_seen": 9982595, "step": 471, "time_per_iteration": 2.556288719177246 }, { "auxiliary_loss_clip": 0.01273697, "auxiliary_loss_mlp": 0.0110454, "balance_loss_clip": 1.07223165, "balance_loss_mlp": 1.06884849, "epoch": 0.02837817525928153, "flos": 28514832410880.0, "grad_norm": 1.8550872135639116, "language_loss": 0.7613501, "learning_rate": 3.964184363657625e-06, "loss": 0.78513247, "num_input_tokens_seen": 10004645, "step": 472, "time_per_iteration": 2.667804002761841 }, { "auxiliary_loss_clip": 0.01280341, "auxiliary_loss_mlp": 0.01090649, "balance_loss_clip": 1.07279634, "balance_loss_mlp": 1.05624473, "epoch": 0.028438298511949497, "flos": 18551668458240.0, "grad_norm": 1.9914661475951314, "language_loss": 0.93097353, "learning_rate": 3.965547014290071e-06, "loss": 0.95468336, "num_input_tokens_seen": 10022555, "step": 473, "time_per_iteration": 2.6402342319488525 }, { "auxiliary_loss_clip": 0.01287339, "auxiliary_loss_mlp": 0.01124194, "balance_loss_clip": 1.07773685, "balance_loss_mlp": 1.08979011, "epoch": 0.028498421764617466, "flos": 16910227722240.0, "grad_norm": 3.2560638787193237, "language_loss": 0.88488632, "learning_rate": 3.96690678709433e-06, "loss": 0.90900171, "num_input_tokens_seen": 10041025, "step": 474, "time_per_iteration": 2.5853888988494873 }, { "auxiliary_loss_clip": 0.0127783, "auxiliary_loss_mlp": 0.01093132, "balance_loss_clip": 1.07535374, "balance_loss_mlp": 1.05620146, "epoch": 0.028558545017285435, "flos": 27778699082880.0, "grad_norm": 3.1427023167402006, "language_loss": 0.78901398, "learning_rate": 3.968263694200355e-06, "loss": 0.81272364, "num_input_tokens_seen": 10060775, "step": 475, "time_per_iteration": 2.654519557952881 }, { "auxiliary_loss_clip": 0.01148107, "auxiliary_loss_mlp": 0.01095224, "balance_loss_clip": 1.04505777, "balance_loss_mlp": 1.08583021, "epoch": 0.028618668269953403, "flos": 65654367258240.0, "grad_norm": 0.9280065830162254, "language_loss": 0.66926932, "learning_rate": 3.969617747661569e-06, "loss": 0.6917026, "num_input_tokens_seen": 10120225, "step": 476, "time_per_iteration": 3.1292569637298584 }, { "auxiliary_loss_clip": 0.01279748, "auxiliary_loss_mlp": 0.01088794, "balance_loss_clip": 1.07638311, "balance_loss_mlp": 1.05188656, "epoch": 0.028678791522621375, "flos": 21937074324480.0, "grad_norm": 2.985672001195028, "language_loss": 0.83807188, "learning_rate": 3.970968959455509e-06, "loss": 0.86175728, "num_input_tokens_seen": 10137880, "step": 477, "time_per_iteration": 2.651493549346924 }, { "auxiliary_loss_clip": 0.01284956, "auxiliary_loss_mlp": 0.0108711, "balance_loss_clip": 1.07924342, "balance_loss_mlp": 1.05089426, "epoch": 0.028738914775289344, "flos": 24572128055040.0, "grad_norm": 2.1929055744411943, "language_loss": 0.8233152, "learning_rate": 3.97231734148446e-06, "loss": 0.84703588, "num_input_tokens_seen": 10156930, "step": 478, "time_per_iteration": 2.6986753940582275 }, { "auxiliary_loss_clip": 0.01277687, "auxiliary_loss_mlp": 0.01080644, "balance_loss_clip": 1.07448888, "balance_loss_mlp": 1.04500043, "epoch": 0.028799038027957313, "flos": 23257977068160.0, "grad_norm": 4.057107988717453, "language_loss": 0.81195259, "learning_rate": 3.973662905576082e-06, "loss": 0.83553594, "num_input_tokens_seen": 10176295, "step": 479, "time_per_iteration": 2.6321041584014893 }, { "auxiliary_loss_clip": 0.01273765, "auxiliary_loss_mlp": 0.01083313, "balance_loss_clip": 1.07335579, "balance_loss_mlp": 1.04552341, "epoch": 0.02885916128062528, "flos": 22164102236160.0, "grad_norm": 2.352225573775279, "language_loss": 0.7335608, "learning_rate": 3.975005663484038e-06, "loss": 0.75713164, "num_input_tokens_seen": 10195790, "step": 480, "time_per_iteration": 2.650696277618408 }, { "auxiliary_loss_clip": 0.01273107, "auxiliary_loss_mlp": 0.01075586, "balance_loss_clip": 1.07424879, "balance_loss_mlp": 1.04277968, "epoch": 0.02891928453329325, "flos": 22932842135040.0, "grad_norm": 1.867890428108999, "language_loss": 0.87560165, "learning_rate": 3.976345626888605e-06, "loss": 0.89908862, "num_input_tokens_seen": 10218405, "step": 481, "time_per_iteration": 2.6585533618927 }, { "auxiliary_loss_clip": 0.01142103, "auxiliary_loss_mlp": 0.01017301, "balance_loss_clip": 1.04286921, "balance_loss_mlp": 1.00895679, "epoch": 0.028979407785961222, "flos": 57432941792640.0, "grad_norm": 0.8486437303263991, "language_loss": 0.66030192, "learning_rate": 3.9776828073972864e-06, "loss": 0.68189597, "num_input_tokens_seen": 10271005, "step": 482, "time_per_iteration": 2.9788918495178223 }, { "auxiliary_loss_clip": 0.01287904, "auxiliary_loss_mlp": 0.01082416, "balance_loss_clip": 1.07739437, "balance_loss_mlp": 1.04868007, "epoch": 0.02903953103862919, "flos": 16722737706240.0, "grad_norm": 2.6473263724689873, "language_loss": 0.7899214, "learning_rate": 3.979017216545415e-06, "loss": 0.81362462, "num_input_tokens_seen": 10288405, "step": 483, "time_per_iteration": 2.5642752647399902 }, { "auxiliary_loss_clip": 0.01283775, "auxiliary_loss_mlp": 0.01097438, "balance_loss_clip": 1.07794189, "balance_loss_mlp": 1.06155562, "epoch": 0.02909965429129716, "flos": 16763640318720.0, "grad_norm": 2.6777328906555766, "language_loss": 0.75510043, "learning_rate": 3.980348865796749e-06, "loss": 0.77891254, "num_input_tokens_seen": 10306875, "step": 484, "time_per_iteration": 2.608337640762329 }, { "auxiliary_loss_clip": 0.0127962, "auxiliary_loss_mlp": 0.01081582, "balance_loss_clip": 1.07543373, "balance_loss_mlp": 1.04760778, "epoch": 0.029159777543965128, "flos": 19785343023360.0, "grad_norm": 2.3457282915841113, "language_loss": 0.8378315, "learning_rate": 3.9816777665440615e-06, "loss": 0.86144352, "num_input_tokens_seen": 10323965, "step": 485, "time_per_iteration": 2.591409921646118 }, { "auxiliary_loss_clip": 0.01282377, "auxiliary_loss_mlp": 0.01084922, "balance_loss_clip": 1.08029485, "balance_loss_mlp": 1.04956484, "epoch": 0.029219900796633096, "flos": 19642670202240.0, "grad_norm": 2.044831141886674, "language_loss": 0.84432101, "learning_rate": 3.983003930109732e-06, "loss": 0.86799401, "num_input_tokens_seen": 10342620, "step": 486, "time_per_iteration": 2.7101452350616455 }, { "auxiliary_loss_clip": 0.01276806, "auxiliary_loss_mlp": 0.01090739, "balance_loss_clip": 1.07363296, "balance_loss_mlp": 1.05476189, "epoch": 0.02928002404930107, "flos": 25885704424320.0, "grad_norm": 12.432525192672303, "language_loss": 0.88968349, "learning_rate": 3.984327367746315e-06, "loss": 0.91335887, "num_input_tokens_seen": 10364610, "step": 487, "time_per_iteration": 2.637910842895508 }, { "auxiliary_loss_clip": 0.01283084, "auxiliary_loss_mlp": 0.01069223, "balance_loss_clip": 1.07921362, "balance_loss_mlp": 1.03677416, "epoch": 0.029340147301969037, "flos": 20660234590080.0, "grad_norm": 2.566388301054309, "language_loss": 0.88581878, "learning_rate": 3.985648090637122e-06, "loss": 0.90934181, "num_input_tokens_seen": 10380910, "step": 488, "time_per_iteration": 2.6569244861602783 }, { "auxiliary_loss_clip": 0.01275613, "auxiliary_loss_mlp": 0.01081415, "balance_loss_clip": 1.07419777, "balance_loss_mlp": 1.04667735, "epoch": 0.029400270554637006, "flos": 24428018689920.0, "grad_norm": 2.0135021623582503, "language_loss": 0.88869834, "learning_rate": 3.986966109896785e-06, "loss": 0.91226858, "num_input_tokens_seen": 10400665, "step": 489, "time_per_iteration": 2.805555582046509 }, { "auxiliary_loss_clip": 0.01271096, "auxiliary_loss_mlp": 0.01077182, "balance_loss_clip": 1.0704807, "balance_loss_mlp": 1.04168141, "epoch": 0.029460393807304974, "flos": 20120892900480.0, "grad_norm": 2.807428314395572, "language_loss": 0.88554472, "learning_rate": 3.988281436571815e-06, "loss": 0.90902752, "num_input_tokens_seen": 10420150, "step": 490, "time_per_iteration": 2.612993001937866 }, { "auxiliary_loss_clip": 0.01276687, "auxiliary_loss_mlp": 0.01088031, "balance_loss_clip": 1.0729506, "balance_loss_mlp": 1.0536747, "epoch": 0.029520517059972943, "flos": 17675914965120.0, "grad_norm": 2.430337539839543, "language_loss": 0.91496718, "learning_rate": 3.989594081641164e-06, "loss": 0.93861437, "num_input_tokens_seen": 10438210, "step": 491, "time_per_iteration": 2.6203627586364746 }, { "auxiliary_loss_clip": 0.01266864, "auxiliary_loss_mlp": 0.01072939, "balance_loss_clip": 1.07131863, "balance_loss_mlp": 1.03984618, "epoch": 0.029580640312640915, "flos": 18953185662720.0, "grad_norm": 1.9753258841331502, "language_loss": 0.85654163, "learning_rate": 3.9909040560167675e-06, "loss": 0.87993968, "num_input_tokens_seen": 10455125, "step": 492, "time_per_iteration": 2.636378288269043 }, { "auxiliary_loss_clip": 0.01279009, "auxiliary_loss_mlp": 0.01100381, "balance_loss_clip": 1.07765996, "balance_loss_mlp": 1.06471384, "epoch": 0.029640763565308884, "flos": 18726121837440.0, "grad_norm": 4.076790847855052, "language_loss": 0.84615922, "learning_rate": 3.992211370544093e-06, "loss": 0.86995316, "num_input_tokens_seen": 10470990, "step": 493, "time_per_iteration": 2.6144914627075195 }, { "auxiliary_loss_clip": 0.01272514, "auxiliary_loss_mlp": 0.01074657, "balance_loss_clip": 1.07140934, "balance_loss_mlp": 1.04042029, "epoch": 0.029700886817976852, "flos": 20595308757120.0, "grad_norm": 1.8084917907335818, "language_loss": 0.8658669, "learning_rate": 3.99351603600268e-06, "loss": 0.88933873, "num_input_tokens_seen": 10490685, "step": 494, "time_per_iteration": 2.7063095569610596 }, { "auxiliary_loss_clip": 0.01281688, "auxiliary_loss_mlp": 0.01084428, "balance_loss_clip": 1.07739305, "balance_loss_mlp": 1.05279028, "epoch": 0.02976101007064482, "flos": 22236857233920.0, "grad_norm": 7.125038043922513, "language_loss": 0.86841047, "learning_rate": 3.994818063106668e-06, "loss": 0.8920716, "num_input_tokens_seen": 10509435, "step": 495, "time_per_iteration": 2.641700267791748 }, { "auxiliary_loss_clip": 0.01268945, "auxiliary_loss_mlp": 0.01078198, "balance_loss_clip": 1.07384837, "balance_loss_mlp": 1.04508162, "epoch": 0.029821133323312793, "flos": 23732644320000.0, "grad_norm": 2.201071528053665, "language_loss": 0.61988759, "learning_rate": 3.99611746250533e-06, "loss": 0.64335901, "num_input_tokens_seen": 10530050, "step": 496, "time_per_iteration": 2.6524407863616943 }, { "auxiliary_loss_clip": 0.01270994, "auxiliary_loss_mlp": 0.01089922, "balance_loss_clip": 1.07575428, "balance_loss_mlp": 1.05680561, "epoch": 0.02988125657598076, "flos": 22419498913920.0, "grad_norm": 1.7538974268426115, "language_loss": 0.88820887, "learning_rate": 3.997414244783595e-06, "loss": 0.91181797, "num_input_tokens_seen": 10551370, "step": 497, "time_per_iteration": 5.648245811462402 }, { "auxiliary_loss_clip": 0.01277289, "auxiliary_loss_mlp": 0.01079642, "balance_loss_clip": 1.07670021, "balance_loss_mlp": 1.04604888, "epoch": 0.02994137982864873, "flos": 13845108453120.0, "grad_norm": 2.8395997319333204, "language_loss": 0.85091698, "learning_rate": 3.998708420462557e-06, "loss": 0.87448633, "num_input_tokens_seen": 10569225, "step": 498, "time_per_iteration": 4.362173080444336 }, { "auxiliary_loss_clip": 0.0127249, "auxiliary_loss_mlp": 0.01078673, "balance_loss_clip": 1.07436109, "balance_loss_mlp": 1.04691589, "epoch": 0.0300015030813167, "flos": 23908354675200.0, "grad_norm": 3.2275044857926605, "language_loss": 0.77883017, "learning_rate": 4e-06, "loss": 0.80234182, "num_input_tokens_seen": 10586170, "step": 499, "time_per_iteration": 2.6029655933380127 }, { "auxiliary_loss_clip": 0.01272525, "auxiliary_loss_mlp": 0.01082339, "balance_loss_clip": 1.07433248, "balance_loss_mlp": 1.04905546, "epoch": 0.030061626333984667, "flos": 22016796560640.0, "grad_norm": 2.244229511477372, "language_loss": 0.82687509, "learning_rate": 3.9999999620799e-06, "loss": 0.85042375, "num_input_tokens_seen": 10606205, "step": 500, "time_per_iteration": 2.6293113231658936 }, { "auxiliary_loss_clip": 0.01266453, "auxiliary_loss_mlp": 0.0108458, "balance_loss_clip": 1.07100737, "balance_loss_mlp": 1.04922247, "epoch": 0.03012174958665264, "flos": 23039747988480.0, "grad_norm": 3.2569274145363356, "language_loss": 0.88086087, "learning_rate": 3.9999998483196e-06, "loss": 0.90437114, "num_input_tokens_seen": 10625995, "step": 501, "time_per_iteration": 2.601081132888794 }, { "auxiliary_loss_clip": 0.01273997, "auxiliary_loss_mlp": 0.01071746, "balance_loss_clip": 1.07361674, "balance_loss_mlp": 1.04025102, "epoch": 0.030181872839320608, "flos": 18953257489920.0, "grad_norm": 3.3627001763511855, "language_loss": 0.86654103, "learning_rate": 3.9999996587191065e-06, "loss": 0.88999844, "num_input_tokens_seen": 10644105, "step": 502, "time_per_iteration": 2.5507659912109375 }, { "auxiliary_loss_clip": 0.01270542, "auxiliary_loss_mlp": 0.01081534, "balance_loss_clip": 1.07475543, "balance_loss_mlp": 1.04827452, "epoch": 0.030241996091988577, "flos": 16728017005440.0, "grad_norm": 2.4572357458963876, "language_loss": 0.84281206, "learning_rate": 3.999999393278425e-06, "loss": 0.86633277, "num_input_tokens_seen": 10661090, "step": 503, "time_per_iteration": 2.618587017059326 }, { "auxiliary_loss_clip": 0.01262547, "auxiliary_loss_mlp": 0.01091143, "balance_loss_clip": 1.0710721, "balance_loss_mlp": 1.05781209, "epoch": 0.030302119344656545, "flos": 28621271387520.0, "grad_norm": 1.6994359255159197, "language_loss": 0.88137805, "learning_rate": 3.999999051997567e-06, "loss": 0.90491492, "num_input_tokens_seen": 10682380, "step": 504, "time_per_iteration": 2.6794183254241943 }, { "auxiliary_loss_clip": 0.01264601, "auxiliary_loss_mlp": 0.01086749, "balance_loss_clip": 1.07040262, "balance_loss_mlp": 1.0541091, "epoch": 0.030362242597324514, "flos": 15669334523520.0, "grad_norm": 2.074855698516145, "language_loss": 0.786093, "learning_rate": 3.9999986348765425e-06, "loss": 0.80960649, "num_input_tokens_seen": 10699925, "step": 505, "time_per_iteration": 2.564960479736328 }, { "auxiliary_loss_clip": 0.01134686, "auxiliary_loss_mlp": 0.010147, "balance_loss_clip": 1.03763247, "balance_loss_mlp": 1.00692737, "epoch": 0.030422365849992486, "flos": 72125973676800.0, "grad_norm": 0.9565689962416369, "language_loss": 0.54981297, "learning_rate": 3.999998141915371e-06, "loss": 0.57130682, "num_input_tokens_seen": 10766525, "step": 506, "time_per_iteration": 3.3345654010772705 }, { "auxiliary_loss_clip": 0.01266577, "auxiliary_loss_mlp": 0.01090299, "balance_loss_clip": 1.07119894, "balance_loss_mlp": 1.05687308, "epoch": 0.030482489102660455, "flos": 19427817000960.0, "grad_norm": 2.2738865373146684, "language_loss": 0.83377159, "learning_rate": 3.999997573114069e-06, "loss": 0.8573404, "num_input_tokens_seen": 10786725, "step": 507, "time_per_iteration": 2.645613670349121 }, { "auxiliary_loss_clip": 0.01269938, "auxiliary_loss_mlp": 0.01076205, "balance_loss_clip": 1.07151937, "balance_loss_mlp": 1.04344678, "epoch": 0.030542612355328423, "flos": 20375822701440.0, "grad_norm": 2.375369924968869, "language_loss": 0.88842839, "learning_rate": 3.999996928472659e-06, "loss": 0.91188985, "num_input_tokens_seen": 10805390, "step": 508, "time_per_iteration": 2.617283344268799 }, { "auxiliary_loss_clip": 0.01272148, "auxiliary_loss_mlp": 0.01067206, "balance_loss_clip": 1.07232118, "balance_loss_mlp": 1.03394616, "epoch": 0.030602735607996392, "flos": 34677354297600.0, "grad_norm": 6.964954749829821, "language_loss": 0.71807706, "learning_rate": 3.999996207991165e-06, "loss": 0.74147063, "num_input_tokens_seen": 10828030, "step": 509, "time_per_iteration": 2.7723498344421387 }, { "auxiliary_loss_clip": 0.01264594, "auxiliary_loss_mlp": 0.01074377, "balance_loss_clip": 1.07241154, "balance_loss_mlp": 1.04333544, "epoch": 0.03066285886066436, "flos": 23658668259840.0, "grad_norm": 1.9285974370038053, "language_loss": 0.82031929, "learning_rate": 3.999995411669614e-06, "loss": 0.84370899, "num_input_tokens_seen": 10845240, "step": 510, "time_per_iteration": 2.6254217624664307 }, { "auxiliary_loss_clip": 0.01268793, "auxiliary_loss_mlp": 0.01075379, "balance_loss_clip": 1.07532823, "balance_loss_mlp": 1.04252458, "epoch": 0.030722982113332332, "flos": 23002975440000.0, "grad_norm": 5.706057095430757, "language_loss": 0.83572316, "learning_rate": 3.999994539508036e-06, "loss": 0.85916495, "num_input_tokens_seen": 10864325, "step": 511, "time_per_iteration": 2.613457441329956 }, { "auxiliary_loss_clip": 0.01269742, "auxiliary_loss_mlp": 0.01081314, "balance_loss_clip": 1.07207167, "balance_loss_mlp": 1.0496521, "epoch": 0.0307831053660003, "flos": 24750855152640.0, "grad_norm": 2.025270681093948, "language_loss": 0.82109964, "learning_rate": 3.9999935915064655e-06, "loss": 0.84461015, "num_input_tokens_seen": 10883860, "step": 512, "time_per_iteration": 2.630404233932495 }, { "auxiliary_loss_clip": 0.01266054, "auxiliary_loss_mlp": 0.01084436, "balance_loss_clip": 1.07086158, "balance_loss_mlp": 1.05070007, "epoch": 0.03084322861866827, "flos": 26140885620480.0, "grad_norm": 2.500363981205655, "language_loss": 0.86933553, "learning_rate": 3.9999925676649374e-06, "loss": 0.89284045, "num_input_tokens_seen": 10904555, "step": 513, "time_per_iteration": 2.671926259994507 }, { "auxiliary_loss_clip": 0.01272542, "auxiliary_loss_mlp": 0.01080065, "balance_loss_clip": 1.07461214, "balance_loss_mlp": 1.04744935, "epoch": 0.03090335187133624, "flos": 18771298168320.0, "grad_norm": 1.704575426690477, "language_loss": 0.79124331, "learning_rate": 3.999991467983491e-06, "loss": 0.81476939, "num_input_tokens_seen": 10923700, "step": 514, "time_per_iteration": 2.6158573627471924 }, { "auxiliary_loss_clip": 0.01265821, "auxiliary_loss_mlp": 0.01067844, "balance_loss_clip": 1.07397485, "balance_loss_mlp": 1.03711247, "epoch": 0.030963475124004207, "flos": 23221886878080.0, "grad_norm": 2.729063628201222, "language_loss": 0.77758944, "learning_rate": 3.999990292462167e-06, "loss": 0.80092615, "num_input_tokens_seen": 10942730, "step": 515, "time_per_iteration": 2.636294364929199 }, { "auxiliary_loss_clip": 0.0126398, "auxiliary_loss_mlp": 0.01072575, "balance_loss_clip": 1.06835747, "balance_loss_mlp": 1.03874326, "epoch": 0.03102359837667218, "flos": 42525595411200.0, "grad_norm": 2.1228851207681503, "language_loss": 0.82452714, "learning_rate": 3.999989041101011e-06, "loss": 0.84789264, "num_input_tokens_seen": 10967120, "step": 516, "time_per_iteration": 2.8078057765960693 }, { "auxiliary_loss_clip": 0.01263726, "auxiliary_loss_mlp": 0.01073859, "balance_loss_clip": 1.0712111, "balance_loss_mlp": 1.04090929, "epoch": 0.031083721629340148, "flos": 21176953689600.0, "grad_norm": 1.9016724574566626, "language_loss": 0.79088318, "learning_rate": 3.999987713900071e-06, "loss": 0.81425899, "num_input_tokens_seen": 10986775, "step": 517, "time_per_iteration": 2.5935981273651123 }, { "auxiliary_loss_clip": 0.0125895, "auxiliary_loss_mlp": 0.0107836, "balance_loss_clip": 1.07049131, "balance_loss_mlp": 1.04629326, "epoch": 0.031143844882008116, "flos": 29716187713920.0, "grad_norm": 1.6829619528007147, "language_loss": 0.90798068, "learning_rate": 3.999986310859396e-06, "loss": 0.93135381, "num_input_tokens_seen": 11011360, "step": 518, "time_per_iteration": 2.6855509281158447 }, { "auxiliary_loss_clip": 0.01272237, "auxiliary_loss_mlp": 0.01097567, "balance_loss_clip": 1.07848859, "balance_loss_mlp": 1.06230497, "epoch": 0.031203968134676085, "flos": 23112467072640.0, "grad_norm": 1.8835331125391583, "language_loss": 0.86759162, "learning_rate": 3.999984831979039e-06, "loss": 0.89128959, "num_input_tokens_seen": 11030150, "step": 519, "time_per_iteration": 2.628380060195923 }, { "auxiliary_loss_clip": 0.01265864, "auxiliary_loss_mlp": 0.01086943, "balance_loss_clip": 1.06901193, "balance_loss_mlp": 1.05578136, "epoch": 0.03126409138734405, "flos": 20954379064320.0, "grad_norm": 3.8823628482318164, "language_loss": 0.87246573, "learning_rate": 3.999983277259057e-06, "loss": 0.89599377, "num_input_tokens_seen": 11049145, "step": 520, "time_per_iteration": 2.5850255489349365 }, { "auxiliary_loss_clip": 0.01269157, "auxiliary_loss_mlp": 0.01086266, "balance_loss_clip": 1.07231963, "balance_loss_mlp": 1.0528394, "epoch": 0.031324214640012026, "flos": 21650112570240.0, "grad_norm": 1.7050130216714323, "language_loss": 0.89274424, "learning_rate": 3.999981646699509e-06, "loss": 0.91629851, "num_input_tokens_seen": 11068835, "step": 521, "time_per_iteration": 2.6412506103515625 }, { "auxiliary_loss_clip": 0.01263772, "auxiliary_loss_mlp": 0.01082584, "balance_loss_clip": 1.0717473, "balance_loss_mlp": 1.04827595, "epoch": 0.03138433789267999, "flos": 23441337020160.0, "grad_norm": 2.085624200373119, "language_loss": 0.71452564, "learning_rate": 3.999979940300456e-06, "loss": 0.73798925, "num_input_tokens_seen": 11088980, "step": 522, "time_per_iteration": 2.6561174392700195 }, { "auxiliary_loss_clip": 0.01265725, "auxiliary_loss_mlp": 0.01082552, "balance_loss_clip": 1.06871116, "balance_loss_mlp": 1.05079484, "epoch": 0.03144446114534796, "flos": 18982164960000.0, "grad_norm": 4.223323698032832, "language_loss": 0.84758592, "learning_rate": 3.999978158061963e-06, "loss": 0.87106872, "num_input_tokens_seen": 11104300, "step": 523, "time_per_iteration": 2.608565330505371 }, { "auxiliary_loss_clip": 0.01271589, "auxiliary_loss_mlp": 0.01076253, "balance_loss_clip": 1.07193565, "balance_loss_mlp": 1.04296994, "epoch": 0.031504584398015935, "flos": 22637692080000.0, "grad_norm": 2.324094801199308, "language_loss": 0.89989722, "learning_rate": 3.999976299984099e-06, "loss": 0.92337573, "num_input_tokens_seen": 11123335, "step": 524, "time_per_iteration": 2.68269944190979 }, { "auxiliary_loss_clip": 0.01273471, "auxiliary_loss_mlp": 0.0108318, "balance_loss_clip": 1.07427168, "balance_loss_mlp": 1.04944324, "epoch": 0.0315647076506839, "flos": 25297056339840.0, "grad_norm": 2.4635323942475766, "language_loss": 0.80114233, "learning_rate": 3.999974366066933e-06, "loss": 0.82470882, "num_input_tokens_seen": 11140880, "step": 525, "time_per_iteration": 2.6396324634552 }, { "auxiliary_loss_clip": 0.01264716, "auxiliary_loss_mlp": 0.01080959, "balance_loss_clip": 1.0681529, "balance_loss_mlp": 1.04798603, "epoch": 0.03162483090335187, "flos": 16982839065600.0, "grad_norm": 2.3553733144031948, "language_loss": 0.81162, "learning_rate": 3.999972356310538e-06, "loss": 0.83507675, "num_input_tokens_seen": 11158710, "step": 526, "time_per_iteration": 2.6167168617248535 }, { "auxiliary_loss_clip": 0.01273987, "auxiliary_loss_mlp": 0.01072725, "balance_loss_clip": 1.07507181, "balance_loss_mlp": 1.03736734, "epoch": 0.03168495415601984, "flos": 18734489706240.0, "grad_norm": 1.9666844995001491, "language_loss": 0.81491739, "learning_rate": 3.999970270714991e-06, "loss": 0.83838451, "num_input_tokens_seen": 11177550, "step": 527, "time_per_iteration": 2.580310821533203 }, { "auxiliary_loss_clip": 0.01261155, "auxiliary_loss_mlp": 0.01080842, "balance_loss_clip": 1.06786597, "balance_loss_mlp": 1.04717755, "epoch": 0.03174507740868781, "flos": 21214875473280.0, "grad_norm": 1.9105688869262756, "language_loss": 0.93801636, "learning_rate": 3.999968109280371e-06, "loss": 0.96143627, "num_input_tokens_seen": 11196230, "step": 528, "time_per_iteration": 2.5901002883911133 }, { "auxiliary_loss_clip": 0.01263275, "auxiliary_loss_mlp": 0.01071724, "balance_loss_clip": 1.06776333, "balance_loss_mlp": 1.0387274, "epoch": 0.03180520066135578, "flos": 24787663614720.0, "grad_norm": 1.8924176613796981, "language_loss": 0.84130204, "learning_rate": 3.99996587200676e-06, "loss": 0.86465204, "num_input_tokens_seen": 11214935, "step": 529, "time_per_iteration": 2.593867063522339 }, { "auxiliary_loss_clip": 0.01266309, "auxiliary_loss_mlp": 0.01088988, "balance_loss_clip": 1.07501197, "balance_loss_mlp": 1.0563724, "epoch": 0.03186532391402375, "flos": 24864261367680.0, "grad_norm": 2.316883777672742, "language_loss": 0.90458709, "learning_rate": 3.999963558894243e-06, "loss": 0.92814004, "num_input_tokens_seen": 11235310, "step": 530, "time_per_iteration": 2.5994982719421387 }, { "auxiliary_loss_clip": 0.01261024, "auxiliary_loss_mlp": 0.0107627, "balance_loss_clip": 1.06481552, "balance_loss_mlp": 1.04188991, "epoch": 0.03192544716669172, "flos": 21215055041280.0, "grad_norm": 2.2744046769674324, "language_loss": 0.76334512, "learning_rate": 3.999961169942907e-06, "loss": 0.78671807, "num_input_tokens_seen": 11254425, "step": 531, "time_per_iteration": 2.618149757385254 }, { "auxiliary_loss_clip": 0.01260981, "auxiliary_loss_mlp": 0.01064937, "balance_loss_clip": 1.0669558, "balance_loss_mlp": 1.03143883, "epoch": 0.03198557041935969, "flos": 24353216616960.0, "grad_norm": 2.467757262816931, "language_loss": 0.90483695, "learning_rate": 3.999958705152843e-06, "loss": 0.92809618, "num_input_tokens_seen": 11274595, "step": 532, "time_per_iteration": 2.647947072982788 }, { "auxiliary_loss_clip": 0.01146464, "auxiliary_loss_mlp": 0.01012028, "balance_loss_clip": 1.04988623, "balance_loss_mlp": 1.00325394, "epoch": 0.032045693672027656, "flos": 61827367587840.0, "grad_norm": 1.9655071928838626, "language_loss": 0.57953775, "learning_rate": 3.9999561645241445e-06, "loss": 0.60112268, "num_input_tokens_seen": 11336705, "step": 533, "time_per_iteration": 3.2502808570861816 }, { "auxiliary_loss_clip": 0.01260941, "auxiliary_loss_mlp": 0.01084263, "balance_loss_clip": 1.06724441, "balance_loss_mlp": 1.0516715, "epoch": 0.03210581692469563, "flos": 28401174800640.0, "grad_norm": 1.7138682169725878, "language_loss": 0.86666048, "learning_rate": 3.999953548056907e-06, "loss": 0.89011252, "num_input_tokens_seen": 11356820, "step": 534, "time_per_iteration": 2.678739070892334 }, { "auxiliary_loss_clip": 0.01259554, "auxiliary_loss_mlp": 0.01066669, "balance_loss_clip": 1.06782031, "balance_loss_mlp": 1.03407741, "epoch": 0.03216594017736359, "flos": 24717709877760.0, "grad_norm": 2.12774196295415, "language_loss": 0.77627808, "learning_rate": 3.999950855751232e-06, "loss": 0.79954034, "num_input_tokens_seen": 11376645, "step": 535, "time_per_iteration": 2.7128217220306396 }, { "auxiliary_loss_clip": 0.01261708, "auxiliary_loss_mlp": 0.01081378, "balance_loss_clip": 1.06843078, "balance_loss_mlp": 1.0485003, "epoch": 0.032226063430031565, "flos": 31175453646720.0, "grad_norm": 3.9913279940153585, "language_loss": 0.80939913, "learning_rate": 3.999948087607219e-06, "loss": 0.83283001, "num_input_tokens_seen": 11397310, "step": 536, "time_per_iteration": 2.7490127086639404 }, { "auxiliary_loss_clip": 0.01262237, "auxiliary_loss_mlp": 0.01075987, "balance_loss_clip": 1.06839073, "balance_loss_mlp": 1.04167831, "epoch": 0.03228618668269954, "flos": 32198225506560.0, "grad_norm": 1.6888601787189168, "language_loss": 0.7009111, "learning_rate": 3.999945243624975e-06, "loss": 0.72429335, "num_input_tokens_seen": 11418475, "step": 537, "time_per_iteration": 5.5609166622161865 }, { "auxiliary_loss_clip": 0.0126357, "auxiliary_loss_mlp": 0.01084205, "balance_loss_clip": 1.07331729, "balance_loss_mlp": 1.05161297, "epoch": 0.0323463099353675, "flos": 22670154996480.0, "grad_norm": 2.146306428033486, "language_loss": 0.82684958, "learning_rate": 3.999942323804607e-06, "loss": 0.85032725, "num_input_tokens_seen": 11436630, "step": 538, "time_per_iteration": 2.5465030670166016 }, { "auxiliary_loss_clip": 0.01269537, "auxiliary_loss_mlp": 0.01078099, "balance_loss_clip": 1.06987572, "balance_loss_mlp": 1.04536414, "epoch": 0.032406433188035474, "flos": 26905172232960.0, "grad_norm": 1.8709064214989917, "language_loss": 0.79146457, "learning_rate": 3.999939328146225e-06, "loss": 0.81494099, "num_input_tokens_seen": 11457275, "step": 539, "time_per_iteration": 4.172123432159424 }, { "auxiliary_loss_clip": 0.0126143, "auxiliary_loss_mlp": 0.01069528, "balance_loss_clip": 1.06830835, "balance_loss_mlp": 1.03567231, "epoch": 0.03246655644070344, "flos": 31503928544640.0, "grad_norm": 35.59051030008172, "language_loss": 0.77379727, "learning_rate": 3.999936256649943e-06, "loss": 0.79710686, "num_input_tokens_seen": 11476925, "step": 540, "time_per_iteration": 2.5633046627044678 }, { "auxiliary_loss_clip": 0.01269863, "auxiliary_loss_mlp": 0.01073669, "balance_loss_clip": 1.07271969, "balance_loss_mlp": 1.04124355, "epoch": 0.03252667969337141, "flos": 23218331431680.0, "grad_norm": 2.0489065110302636, "language_loss": 0.85458571, "learning_rate": 3.999933109315878e-06, "loss": 0.878021, "num_input_tokens_seen": 11496830, "step": 541, "time_per_iteration": 2.6079938411712646 }, { "auxiliary_loss_clip": 0.01258504, "auxiliary_loss_mlp": 0.01082451, "balance_loss_clip": 1.06961954, "balance_loss_mlp": 1.04835749, "epoch": 0.032586802946039384, "flos": 14757454926720.0, "grad_norm": 2.674731240129174, "language_loss": 0.89234567, "learning_rate": 3.9999298861441496e-06, "loss": 0.91575521, "num_input_tokens_seen": 11515605, "step": 542, "time_per_iteration": 2.597036600112915 }, { "auxiliary_loss_clip": 0.0126351, "auxiliary_loss_mlp": 0.01081041, "balance_loss_clip": 1.06974792, "balance_loss_mlp": 1.04792452, "epoch": 0.03264692619870735, "flos": 24280677100800.0, "grad_norm": 2.2714121360014334, "language_loss": 0.71123677, "learning_rate": 3.999926587134879e-06, "loss": 0.73468232, "num_input_tokens_seen": 11536230, "step": 543, "time_per_iteration": 2.634601354598999 }, { "auxiliary_loss_clip": 0.01259994, "auxiliary_loss_mlp": 0.01088763, "balance_loss_clip": 1.06379187, "balance_loss_mlp": 1.05545604, "epoch": 0.03270704945137532, "flos": 22893160584960.0, "grad_norm": 4.777521083182084, "language_loss": 0.91540575, "learning_rate": 3.999923212288192e-06, "loss": 0.93889332, "num_input_tokens_seen": 11554715, "step": 544, "time_per_iteration": 2.6173009872436523 }, { "auxiliary_loss_clip": 0.01264485, "auxiliary_loss_mlp": 0.01085684, "balance_loss_clip": 1.06989884, "balance_loss_mlp": 1.05571437, "epoch": 0.032767172704043286, "flos": 18041018757120.0, "grad_norm": 2.6951315012120025, "language_loss": 0.65799558, "learning_rate": 3.999919761604216e-06, "loss": 0.68149722, "num_input_tokens_seen": 11571370, "step": 545, "time_per_iteration": 2.6500988006591797 }, { "auxiliary_loss_clip": 0.012623, "auxiliary_loss_mlp": 0.0107161, "balance_loss_clip": 1.06693912, "balance_loss_mlp": 1.0393517, "epoch": 0.03282729595671126, "flos": 22528739151360.0, "grad_norm": 2.2564766449723908, "language_loss": 0.92221987, "learning_rate": 3.999916235083083e-06, "loss": 0.94555902, "num_input_tokens_seen": 11588560, "step": 546, "time_per_iteration": 2.673250913619995 }, { "auxiliary_loss_clip": 0.01260258, "auxiliary_loss_mlp": 0.01077296, "balance_loss_clip": 1.06488204, "balance_loss_mlp": 1.04313052, "epoch": 0.03288741920937923, "flos": 20410620001920.0, "grad_norm": 2.1923718908590653, "language_loss": 0.81706661, "learning_rate": 3.999912632724925e-06, "loss": 0.84044212, "num_input_tokens_seen": 11605685, "step": 547, "time_per_iteration": 2.725198745727539 }, { "auxiliary_loss_clip": 0.0126227, "auxiliary_loss_mlp": 0.0107871, "balance_loss_clip": 1.06794477, "balance_loss_mlp": 1.04480648, "epoch": 0.032947542462047195, "flos": 20777986350720.0, "grad_norm": 1.730652582963277, "language_loss": 0.81227565, "learning_rate": 3.999908954529881e-06, "loss": 0.83568549, "num_input_tokens_seen": 11626290, "step": 548, "time_per_iteration": 2.714073419570923 }, { "auxiliary_loss_clip": 0.01264818, "auxiliary_loss_mlp": 0.01084154, "balance_loss_clip": 1.06963027, "balance_loss_mlp": 1.04870164, "epoch": 0.03300766571471517, "flos": 19901263190400.0, "grad_norm": 3.8540092911047603, "language_loss": 0.67460287, "learning_rate": 3.999905200498087e-06, "loss": 0.69809258, "num_input_tokens_seen": 11643950, "step": 549, "time_per_iteration": 2.6747171878814697 }, { "auxiliary_loss_clip": 0.0125805, "auxiliary_loss_mlp": 0.01076001, "balance_loss_clip": 1.06968856, "balance_loss_mlp": 1.04236054, "epoch": 0.03306778896738313, "flos": 17967760968960.0, "grad_norm": 1.933615596136007, "language_loss": 0.86379111, "learning_rate": 3.999901370629689e-06, "loss": 0.88713157, "num_input_tokens_seen": 11662560, "step": 550, "time_per_iteration": 2.553386926651001 }, { "auxiliary_loss_clip": 0.01264951, "auxiliary_loss_mlp": 0.01095377, "balance_loss_clip": 1.07279766, "balance_loss_mlp": 1.06142652, "epoch": 0.033127912220051105, "flos": 21653380707840.0, "grad_norm": 3.1958143211070977, "language_loss": 0.8127178, "learning_rate": 3.99989746492483e-06, "loss": 0.83632112, "num_input_tokens_seen": 11682265, "step": 551, "time_per_iteration": 2.6231682300567627 }, { "auxiliary_loss_clip": 0.01271579, "auxiliary_loss_mlp": 0.0108998, "balance_loss_clip": 1.07285261, "balance_loss_mlp": 1.05626702, "epoch": 0.03318803547271908, "flos": 30188376927360.0, "grad_norm": 2.9473143774727606, "language_loss": 0.86134821, "learning_rate": 3.999893483383658e-06, "loss": 0.88496381, "num_input_tokens_seen": 11699300, "step": 552, "time_per_iteration": 2.7002694606781006 }, { "auxiliary_loss_clip": 0.01267081, "auxiliary_loss_mlp": 0.01081671, "balance_loss_clip": 1.07191086, "balance_loss_mlp": 1.04650474, "epoch": 0.03324815872538704, "flos": 20376038183040.0, "grad_norm": 2.990469903058063, "language_loss": 0.9301765, "learning_rate": 3.999889426006326e-06, "loss": 0.95366406, "num_input_tokens_seen": 11716955, "step": 553, "time_per_iteration": 2.6629648208618164 }, { "auxiliary_loss_clip": 0.01262345, "auxiliary_loss_mlp": 0.01077186, "balance_loss_clip": 1.06925786, "balance_loss_mlp": 1.04149485, "epoch": 0.033308281978055014, "flos": 24494560634880.0, "grad_norm": 2.1924330874053166, "language_loss": 0.78881586, "learning_rate": 3.999885292792986e-06, "loss": 0.8122111, "num_input_tokens_seen": 11736130, "step": 554, "time_per_iteration": 2.668970823287964 }, { "auxiliary_loss_clip": 0.01258048, "auxiliary_loss_mlp": 0.0108557, "balance_loss_clip": 1.06745815, "balance_loss_mlp": 1.05045104, "epoch": 0.03336840523072298, "flos": 23400326666880.0, "grad_norm": 2.2144550089326938, "language_loss": 0.81971425, "learning_rate": 3.999881083743795e-06, "loss": 0.84315038, "num_input_tokens_seen": 11754425, "step": 555, "time_per_iteration": 2.610807418823242 }, { "auxiliary_loss_clip": 0.01264442, "auxiliary_loss_mlp": 0.0108339, "balance_loss_clip": 1.06914032, "balance_loss_mlp": 1.04805672, "epoch": 0.03342852848339095, "flos": 30550571717760.0, "grad_norm": 3.7821745066525487, "language_loss": 0.88661897, "learning_rate": 3.999876798858914e-06, "loss": 0.9100973, "num_input_tokens_seen": 11772845, "step": 556, "time_per_iteration": 2.6288907527923584 }, { "auxiliary_loss_clip": 0.01262553, "auxiliary_loss_mlp": 0.01084158, "balance_loss_clip": 1.06896496, "balance_loss_mlp": 1.04863358, "epoch": 0.03348865173605892, "flos": 22893304239360.0, "grad_norm": 1.974910128087634, "language_loss": 0.83708388, "learning_rate": 3.999872438138503e-06, "loss": 0.860551, "num_input_tokens_seen": 11792850, "step": 557, "time_per_iteration": 2.649401903152466 }, { "auxiliary_loss_clip": 0.01268198, "auxiliary_loss_mlp": 0.01069057, "balance_loss_clip": 1.07400489, "balance_loss_mlp": 1.03684711, "epoch": 0.03354877498872689, "flos": 17676022705920.0, "grad_norm": 3.176542206824637, "language_loss": 0.94202292, "learning_rate": 3.999868001582729e-06, "loss": 0.96539545, "num_input_tokens_seen": 11809670, "step": 558, "time_per_iteration": 2.550515651702881 }, { "auxiliary_loss_clip": 0.01258948, "auxiliary_loss_mlp": 0.01074291, "balance_loss_clip": 1.06591845, "balance_loss_mlp": 1.04036427, "epoch": 0.03360889824139486, "flos": 21652985658240.0, "grad_norm": 2.6619487077732384, "language_loss": 0.77115649, "learning_rate": 3.99986348919176e-06, "loss": 0.79448891, "num_input_tokens_seen": 11829665, "step": 559, "time_per_iteration": 2.729597330093384 }, { "auxiliary_loss_clip": 0.01261947, "auxiliary_loss_mlp": 0.01080822, "balance_loss_clip": 1.06835234, "balance_loss_mlp": 1.04882574, "epoch": 0.033669021494062826, "flos": 21795730306560.0, "grad_norm": 1.945022837871561, "language_loss": 0.87472397, "learning_rate": 3.9998589009657675e-06, "loss": 0.89815164, "num_input_tokens_seen": 11848190, "step": 560, "time_per_iteration": 2.6082279682159424 }, { "auxiliary_loss_clip": 0.01257198, "auxiliary_loss_mlp": 0.0107356, "balance_loss_clip": 1.06704283, "balance_loss_mlp": 1.04199314, "epoch": 0.0337291447467308, "flos": 21866222747520.0, "grad_norm": 2.4061219554407502, "language_loss": 0.81578708, "learning_rate": 3.999854236904925e-06, "loss": 0.83909464, "num_input_tokens_seen": 11864795, "step": 561, "time_per_iteration": 2.602193832397461 }, { "auxiliary_loss_clip": 0.01254722, "auxiliary_loss_mlp": 0.01076361, "balance_loss_clip": 1.06685936, "balance_loss_mlp": 1.04422247, "epoch": 0.03378926799939877, "flos": 24245951627520.0, "grad_norm": 1.683217504050761, "language_loss": 0.82320511, "learning_rate": 3.999849497009409e-06, "loss": 0.84651601, "num_input_tokens_seen": 11885275, "step": 562, "time_per_iteration": 2.675872564315796 }, { "auxiliary_loss_clip": 0.01262146, "auxiliary_loss_mlp": 0.01084212, "balance_loss_clip": 1.06894755, "balance_loss_mlp": 1.0508337, "epoch": 0.033849391252066735, "flos": 16507812677760.0, "grad_norm": 2.262509698135982, "language_loss": 0.84285647, "learning_rate": 3.999844681279401e-06, "loss": 0.86632001, "num_input_tokens_seen": 11903595, "step": 563, "time_per_iteration": 2.586944103240967 }, { "auxiliary_loss_clip": 0.01258135, "auxiliary_loss_mlp": 0.01083866, "balance_loss_clip": 1.0675565, "balance_loss_mlp": 1.05094075, "epoch": 0.03390951450473471, "flos": 15669298609920.0, "grad_norm": 2.115200912185494, "language_loss": 0.94438875, "learning_rate": 3.99983978971508e-06, "loss": 0.96780878, "num_input_tokens_seen": 11917815, "step": 564, "time_per_iteration": 2.5444440841674805 }, { "auxiliary_loss_clip": 0.01259509, "auxiliary_loss_mlp": 0.01073406, "balance_loss_clip": 1.06518865, "balance_loss_mlp": 1.03907406, "epoch": 0.03396963775740267, "flos": 22674787850880.0, "grad_norm": 2.6560391741906924, "language_loss": 0.94669235, "learning_rate": 3.999834822316635e-06, "loss": 0.97002149, "num_input_tokens_seen": 11936305, "step": 565, "time_per_iteration": 2.5614171028137207 }, { "auxiliary_loss_clip": 0.01150452, "auxiliary_loss_mlp": 0.01081579, "balance_loss_clip": 1.04835606, "balance_loss_mlp": 1.07499874, "epoch": 0.034029761010070644, "flos": 64392683063040.0, "grad_norm": 1.0610477485673708, "language_loss": 0.54800498, "learning_rate": 3.9998297790842535e-06, "loss": 0.57032537, "num_input_tokens_seen": 11998940, "step": 566, "time_per_iteration": 3.229137659072876 }, { "auxiliary_loss_clip": 0.0126129, "auxiliary_loss_mlp": 0.01073482, "balance_loss_clip": 1.06798041, "balance_loss_mlp": 1.03793335, "epoch": 0.034089884262738616, "flos": 25004204755200.0, "grad_norm": 3.1955261820278564, "language_loss": 0.76836932, "learning_rate": 3.999824660018126e-06, "loss": 0.79171705, "num_input_tokens_seen": 12018860, "step": 567, "time_per_iteration": 2.632741928100586 }, { "auxiliary_loss_clip": 0.01253596, "auxiliary_loss_mlp": 0.01083559, "balance_loss_clip": 1.06611466, "balance_loss_mlp": 1.05153918, "epoch": 0.03415000751540658, "flos": 28439096584320.0, "grad_norm": 2.115683621050472, "language_loss": 0.80834144, "learning_rate": 3.999819465118447e-06, "loss": 0.83171296, "num_input_tokens_seen": 12039675, "step": 568, "time_per_iteration": 2.7206337451934814 }, { "auxiliary_loss_clip": 0.01254921, "auxiliary_loss_mlp": 0.01082401, "balance_loss_clip": 1.06888509, "balance_loss_mlp": 1.04940367, "epoch": 0.034210130768074554, "flos": 21468727866240.0, "grad_norm": 1.891360159585894, "language_loss": 0.86560667, "learning_rate": 3.999814194385413e-06, "loss": 0.88897985, "num_input_tokens_seen": 12057680, "step": 569, "time_per_iteration": 2.7271673679351807 }, { "auxiliary_loss_clip": 0.01255135, "auxiliary_loss_mlp": 0.01082251, "balance_loss_clip": 1.06644094, "balance_loss_mlp": 1.04922962, "epoch": 0.03427025402074252, "flos": 18697501676160.0, "grad_norm": 1.6888504559193653, "language_loss": 0.95945716, "learning_rate": 3.9998088478192255e-06, "loss": 0.982831, "num_input_tokens_seen": 12076135, "step": 570, "time_per_iteration": 2.5918867588043213 }, { "auxiliary_loss_clip": 0.01255487, "auxiliary_loss_mlp": 0.0108066, "balance_loss_clip": 1.06228065, "balance_loss_mlp": 1.0435617, "epoch": 0.03433037727341049, "flos": 20849987162880.0, "grad_norm": 2.39132447086081, "language_loss": 0.7964232, "learning_rate": 3.9998034254200846e-06, "loss": 0.8197847, "num_input_tokens_seen": 12094785, "step": 571, "time_per_iteration": 2.590184450149536 }, { "auxiliary_loss_clip": 0.01256218, "auxiliary_loss_mlp": 0.01091484, "balance_loss_clip": 1.06740785, "balance_loss_mlp": 1.0565083, "epoch": 0.03439050052607846, "flos": 25410282986880.0, "grad_norm": 2.0738695690993, "language_loss": 0.80214274, "learning_rate": 3.999797927188199e-06, "loss": 0.82561976, "num_input_tokens_seen": 12114590, "step": 572, "time_per_iteration": 2.6862123012542725 }, { "auxiliary_loss_clip": 0.01263024, "auxiliary_loss_mlp": 0.01074173, "balance_loss_clip": 1.06995344, "balance_loss_mlp": 1.04098535, "epoch": 0.03445062377874643, "flos": 17640147997440.0, "grad_norm": 2.2324763929909284, "language_loss": 0.84548658, "learning_rate": 3.999792353123774e-06, "loss": 0.86885858, "num_input_tokens_seen": 12132390, "step": 573, "time_per_iteration": 2.78487229347229 }, { "auxiliary_loss_clip": 0.01256326, "auxiliary_loss_mlp": 0.01068789, "balance_loss_clip": 1.0644815, "balance_loss_mlp": 1.03781831, "epoch": 0.0345107470314144, "flos": 16764502245120.0, "grad_norm": 2.576428901855709, "language_loss": 0.76602584, "learning_rate": 3.999786703227023e-06, "loss": 0.78927696, "num_input_tokens_seen": 12149035, "step": 574, "time_per_iteration": 2.5697100162506104 }, { "auxiliary_loss_clip": 0.01255191, "auxiliary_loss_mlp": 0.0107671, "balance_loss_clip": 1.06581593, "balance_loss_mlp": 1.04502439, "epoch": 0.03457087028408237, "flos": 14684448533760.0, "grad_norm": 2.156110110571344, "language_loss": 0.83854586, "learning_rate": 3.9997809774981606e-06, "loss": 0.86186486, "num_input_tokens_seen": 12167530, "step": 575, "time_per_iteration": 2.596418619155884 }, { "auxiliary_loss_clip": 0.01249695, "auxiliary_loss_mlp": 0.01076053, "balance_loss_clip": 1.06684637, "balance_loss_mlp": 1.04334211, "epoch": 0.03463099353675034, "flos": 20011293527040.0, "grad_norm": 2.350120742735315, "language_loss": 0.83990753, "learning_rate": 3.9997751759374025e-06, "loss": 0.86316502, "num_input_tokens_seen": 12186340, "step": 576, "time_per_iteration": 5.821930646896362 }, { "auxiliary_loss_clip": 0.01257114, "auxiliary_loss_mlp": 0.01079503, "balance_loss_clip": 1.07237518, "balance_loss_mlp": 1.04817426, "epoch": 0.03469111678941831, "flos": 25301150490240.0, "grad_norm": 2.138457686407641, "language_loss": 0.85803086, "learning_rate": 3.99976929854497e-06, "loss": 0.88139701, "num_input_tokens_seen": 12204090, "step": 577, "time_per_iteration": 4.225277423858643 }, { "auxiliary_loss_clip": 0.01253845, "auxiliary_loss_mlp": 0.01080214, "balance_loss_clip": 1.06869018, "balance_loss_mlp": 1.04712176, "epoch": 0.034751240042086275, "flos": 23259413612160.0, "grad_norm": 4.535240156776142, "language_loss": 0.72226608, "learning_rate": 3.9997633453210845e-06, "loss": 0.74560666, "num_input_tokens_seen": 12224850, "step": 578, "time_per_iteration": 4.486239433288574 }, { "auxiliary_loss_clip": 0.01251871, "auxiliary_loss_mlp": 0.01080519, "balance_loss_clip": 1.06461096, "balance_loss_mlp": 1.04663968, "epoch": 0.03481136329475425, "flos": 23769237300480.0, "grad_norm": 1.9496379050984929, "language_loss": 0.77785492, "learning_rate": 3.999757316265973e-06, "loss": 0.80117887, "num_input_tokens_seen": 12244935, "step": 579, "time_per_iteration": 2.6706583499908447 }, { "auxiliary_loss_clip": 0.01251647, "auxiliary_loss_mlp": 0.01087497, "balance_loss_clip": 1.06656826, "balance_loss_mlp": 1.05435717, "epoch": 0.03487148654742222, "flos": 20157521794560.0, "grad_norm": 2.054973215074824, "language_loss": 0.86841297, "learning_rate": 3.999751211379863e-06, "loss": 0.8918044, "num_input_tokens_seen": 12262140, "step": 580, "time_per_iteration": 2.639146566390991 }, { "auxiliary_loss_clip": 0.01256528, "auxiliary_loss_mlp": 0.01069029, "balance_loss_clip": 1.06636667, "balance_loss_mlp": 1.0398469, "epoch": 0.034931609800090184, "flos": 15669585918720.0, "grad_norm": 2.205850105033732, "language_loss": 0.82570344, "learning_rate": 3.999745030662987e-06, "loss": 0.84895897, "num_input_tokens_seen": 12280930, "step": 581, "time_per_iteration": 2.6505649089813232 }, { "auxiliary_loss_clip": 0.01252942, "auxiliary_loss_mlp": 0.01072317, "balance_loss_clip": 1.06823969, "balance_loss_mlp": 1.04168022, "epoch": 0.034991733052758156, "flos": 16362374509440.0, "grad_norm": 2.1922492117358146, "language_loss": 0.7733047, "learning_rate": 3.99973877411558e-06, "loss": 0.79655731, "num_input_tokens_seen": 12299125, "step": 582, "time_per_iteration": 2.7323596477508545 }, { "auxiliary_loss_clip": 0.01250253, "auxiliary_loss_mlp": 0.01082356, "balance_loss_clip": 1.06794167, "balance_loss_mlp": 1.04861939, "epoch": 0.03505185630542612, "flos": 19387309438080.0, "grad_norm": 2.1536178016194327, "language_loss": 0.87679923, "learning_rate": 3.999732441737877e-06, "loss": 0.90012532, "num_input_tokens_seen": 12316905, "step": 583, "time_per_iteration": 2.6049294471740723 }, { "auxiliary_loss_clip": 0.01255473, "auxiliary_loss_mlp": 0.01092826, "balance_loss_clip": 1.06699181, "balance_loss_mlp": 1.06104505, "epoch": 0.03511197955809409, "flos": 21323828401920.0, "grad_norm": 3.7027110169592015, "language_loss": 0.81196821, "learning_rate": 3.99972603353012e-06, "loss": 0.83545119, "num_input_tokens_seen": 12335070, "step": 584, "time_per_iteration": 2.6011815071105957 }, { "auxiliary_loss_clip": 0.01251161, "auxiliary_loss_mlp": 0.01069463, "balance_loss_clip": 1.06472683, "balance_loss_mlp": 1.03832567, "epoch": 0.035172102810762065, "flos": 14136595320960.0, "grad_norm": 3.067717812226321, "language_loss": 0.92399198, "learning_rate": 3.999719549492551e-06, "loss": 0.94719815, "num_input_tokens_seen": 12350315, "step": 585, "time_per_iteration": 2.5592780113220215 }, { "auxiliary_loss_clip": 0.01251271, "auxiliary_loss_mlp": 0.01077423, "balance_loss_clip": 1.06562734, "balance_loss_mlp": 1.04552317, "epoch": 0.03523222606343003, "flos": 20296890564480.0, "grad_norm": 2.196660024103635, "language_loss": 0.87644351, "learning_rate": 3.9997129896254165e-06, "loss": 0.89973044, "num_input_tokens_seen": 12366030, "step": 586, "time_per_iteration": 2.5486221313476562 }, { "auxiliary_loss_clip": 0.01256485, "auxiliary_loss_mlp": 0.0108018, "balance_loss_clip": 1.06803596, "balance_loss_mlp": 1.04918551, "epoch": 0.035292349316098, "flos": 20375822701440.0, "grad_norm": 2.1222089199850878, "language_loss": 0.76079381, "learning_rate": 3.999706353928965e-06, "loss": 0.78416049, "num_input_tokens_seen": 12384895, "step": 587, "time_per_iteration": 2.5923714637756348 }, { "auxiliary_loss_clip": 0.01257125, "auxiliary_loss_mlp": 0.01068649, "balance_loss_clip": 1.06683922, "balance_loss_mlp": 1.03586686, "epoch": 0.03535247256876597, "flos": 21468871520640.0, "grad_norm": 2.212352192395094, "language_loss": 0.78601038, "learning_rate": 3.999699642403449e-06, "loss": 0.80926806, "num_input_tokens_seen": 12404980, "step": 588, "time_per_iteration": 2.579280138015747 }, { "auxiliary_loss_clip": 0.0125398, "auxiliary_loss_mlp": 0.0107827, "balance_loss_clip": 1.06582928, "balance_loss_mlp": 1.04367518, "epoch": 0.03541259582143394, "flos": 23623044946560.0, "grad_norm": 2.153589114745919, "language_loss": 0.94312829, "learning_rate": 3.99969285504912e-06, "loss": 0.96645081, "num_input_tokens_seen": 12423835, "step": 589, "time_per_iteration": 2.5964701175689697 }, { "auxiliary_loss_clip": 0.01256884, "auxiliary_loss_mlp": 0.01078108, "balance_loss_clip": 1.06697679, "balance_loss_mlp": 1.04666042, "epoch": 0.03547271907410191, "flos": 33726367768320.0, "grad_norm": 2.1162556876212695, "language_loss": 0.84116042, "learning_rate": 3.99968599186624e-06, "loss": 0.8645103, "num_input_tokens_seen": 12443135, "step": 590, "time_per_iteration": 2.746436357498169 }, { "auxiliary_loss_clip": 0.01249398, "auxiliary_loss_mlp": 0.01068452, "balance_loss_clip": 1.06658125, "balance_loss_mlp": 1.03893578, "epoch": 0.03553284232676988, "flos": 21142695093120.0, "grad_norm": 1.984522351394552, "language_loss": 0.8684091, "learning_rate": 3.999679052855065e-06, "loss": 0.89158762, "num_input_tokens_seen": 12462895, "step": 591, "time_per_iteration": 2.692303419113159 }, { "auxiliary_loss_clip": 0.01250641, "auxiliary_loss_mlp": 0.01082122, "balance_loss_clip": 1.06297326, "balance_loss_mlp": 1.04883862, "epoch": 0.03559296557943785, "flos": 20046593617920.0, "grad_norm": 2.0873185001780783, "language_loss": 0.83075488, "learning_rate": 3.999672038015861e-06, "loss": 0.85408247, "num_input_tokens_seen": 12481515, "step": 592, "time_per_iteration": 2.7822203636169434 }, { "auxiliary_loss_clip": 0.01146211, "auxiliary_loss_mlp": 0.01034159, "balance_loss_clip": 1.05013406, "balance_loss_mlp": 1.02676773, "epoch": 0.035653088832105814, "flos": 60334597244160.0, "grad_norm": 0.8804992705477848, "language_loss": 0.59754086, "learning_rate": 3.999664947348893e-06, "loss": 0.61934447, "num_input_tokens_seen": 12548220, "step": 593, "time_per_iteration": 3.274080276489258 }, { "auxiliary_loss_clip": 0.01249386, "auxiliary_loss_mlp": 0.0107742, "balance_loss_clip": 1.06737614, "balance_loss_mlp": 1.04473329, "epoch": 0.035713212084773786, "flos": 20113135562880.0, "grad_norm": 1.8086551314359374, "language_loss": 0.87077361, "learning_rate": 3.999657780854429e-06, "loss": 0.89404166, "num_input_tokens_seen": 12566105, "step": 594, "time_per_iteration": 2.682236671447754 }, { "auxiliary_loss_clip": 0.012487, "auxiliary_loss_mlp": 0.01082358, "balance_loss_clip": 1.06235993, "balance_loss_mlp": 1.05057716, "epoch": 0.03577333533744176, "flos": 26285785084800.0, "grad_norm": 5.516524335860627, "language_loss": 0.83920246, "learning_rate": 3.999650538532742e-06, "loss": 0.86251307, "num_input_tokens_seen": 12586680, "step": 595, "time_per_iteration": 2.773669481277466 }, { "auxiliary_loss_clip": 0.01248678, "auxiliary_loss_mlp": 0.01090544, "balance_loss_clip": 1.06579614, "balance_loss_mlp": 1.05850017, "epoch": 0.035833458590109724, "flos": 10889732211840.0, "grad_norm": 2.3448814752825204, "language_loss": 0.96041518, "learning_rate": 3.999643220384106e-06, "loss": 0.98380733, "num_input_tokens_seen": 12601605, "step": 596, "time_per_iteration": 2.6541590690612793 }, { "auxiliary_loss_clip": 0.01252662, "auxiliary_loss_mlp": 0.01081887, "balance_loss_clip": 1.0675534, "balance_loss_mlp": 1.05165553, "epoch": 0.035893581842777696, "flos": 22090198003200.0, "grad_norm": 2.4353221882859004, "language_loss": 0.82993281, "learning_rate": 3.999635826408799e-06, "loss": 0.85327828, "num_input_tokens_seen": 12620365, "step": 597, "time_per_iteration": 2.7023818492889404 }, { "auxiliary_loss_clip": 0.01247839, "auxiliary_loss_mlp": 0.01079829, "balance_loss_clip": 1.0668776, "balance_loss_mlp": 1.04766583, "epoch": 0.03595370509544566, "flos": 23038347358080.0, "grad_norm": 2.374757318483944, "language_loss": 0.81364304, "learning_rate": 3.999628356607101e-06, "loss": 0.83691972, "num_input_tokens_seen": 12641140, "step": 598, "time_per_iteration": 2.731229782104492 }, { "auxiliary_loss_clip": 0.01243692, "auxiliary_loss_mlp": 0.01077827, "balance_loss_clip": 1.0663228, "balance_loss_mlp": 1.04587913, "epoch": 0.03601382834811363, "flos": 20777734955520.0, "grad_norm": 1.817680341814684, "language_loss": 0.81172699, "learning_rate": 3.999620810979295e-06, "loss": 0.83494222, "num_input_tokens_seen": 12661080, "step": 599, "time_per_iteration": 2.710191011428833 }, { "auxiliary_loss_clip": 0.01250419, "auxiliary_loss_mlp": 0.01074577, "balance_loss_clip": 1.06356514, "balance_loss_mlp": 1.045228, "epoch": 0.036073951600781605, "flos": 23951627585280.0, "grad_norm": 2.3963649020429627, "language_loss": 0.8651731, "learning_rate": 3.999613189525668e-06, "loss": 0.88842309, "num_input_tokens_seen": 12678270, "step": 600, "time_per_iteration": 2.682262420654297 }, { "auxiliary_loss_clip": 0.01241882, "auxiliary_loss_mlp": 0.01084809, "balance_loss_clip": 1.05918193, "balance_loss_mlp": 1.05297971, "epoch": 0.03613407485344957, "flos": 18912283050240.0, "grad_norm": 2.0308947613075423, "language_loss": 0.82355881, "learning_rate": 3.999605492246508e-06, "loss": 0.84682572, "num_input_tokens_seen": 12697295, "step": 601, "time_per_iteration": 2.6570894718170166 }, { "auxiliary_loss_clip": 0.01240868, "auxiliary_loss_mlp": 0.010708, "balance_loss_clip": 1.06129336, "balance_loss_mlp": 1.03920949, "epoch": 0.03619419810611754, "flos": 23038526926080.0, "grad_norm": 2.3080142694085555, "language_loss": 0.7502507, "learning_rate": 3.999597719142107e-06, "loss": 0.77336735, "num_input_tokens_seen": 12716165, "step": 602, "time_per_iteration": 2.6434237957000732 }, { "auxiliary_loss_clip": 0.01239543, "auxiliary_loss_mlp": 0.01066859, "balance_loss_clip": 1.0604254, "balance_loss_mlp": 1.03562629, "epoch": 0.03625432135878551, "flos": 29457774293760.0, "grad_norm": 1.9681237382646195, "language_loss": 0.79599822, "learning_rate": 3.999589870212761e-06, "loss": 0.81906223, "num_input_tokens_seen": 12735475, "step": 603, "time_per_iteration": 2.7201666831970215 }, { "auxiliary_loss_clip": 0.01244834, "auxiliary_loss_mlp": 0.01071177, "balance_loss_clip": 1.06545615, "balance_loss_mlp": 1.04130292, "epoch": 0.03631444461145348, "flos": 23508525409920.0, "grad_norm": 1.8363641170913294, "language_loss": 0.86668456, "learning_rate": 3.9995819454587664e-06, "loss": 0.88984472, "num_input_tokens_seen": 12754540, "step": 604, "time_per_iteration": 2.60249924659729 }, { "auxiliary_loss_clip": 0.01248906, "auxiliary_loss_mlp": 0.01072985, "balance_loss_clip": 1.0674324, "balance_loss_mlp": 1.04010737, "epoch": 0.03637456786412145, "flos": 16618130323200.0, "grad_norm": 2.510130211393037, "language_loss": 0.80746496, "learning_rate": 3.999573944880424e-06, "loss": 0.83068383, "num_input_tokens_seen": 12773050, "step": 605, "time_per_iteration": 2.766684055328369 }, { "auxiliary_loss_clip": 0.01244274, "auxiliary_loss_mlp": 0.0107873, "balance_loss_clip": 1.0630821, "balance_loss_mlp": 1.04846251, "epoch": 0.03643469111678942, "flos": 15851832549120.0, "grad_norm": 2.2216143800596835, "language_loss": 0.85942292, "learning_rate": 3.9995658684780375e-06, "loss": 0.882653, "num_input_tokens_seen": 12791240, "step": 606, "time_per_iteration": 2.6133925914764404 }, { "auxiliary_loss_clip": 0.01247732, "auxiliary_loss_mlp": 0.01077404, "balance_loss_clip": 1.06413972, "balance_loss_mlp": 1.04588532, "epoch": 0.03649481436945739, "flos": 23620387340160.0, "grad_norm": 2.0684825764003394, "language_loss": 0.82179952, "learning_rate": 3.999557716251912e-06, "loss": 0.84505081, "num_input_tokens_seen": 12812245, "step": 607, "time_per_iteration": 2.6805856227874756 }, { "auxiliary_loss_clip": 0.01245394, "auxiliary_loss_mlp": 0.01073743, "balance_loss_clip": 1.06585169, "balance_loss_mlp": 1.04317796, "epoch": 0.036554937622125354, "flos": 21755581879680.0, "grad_norm": 2.3717179235904533, "language_loss": 0.83567071, "learning_rate": 3.999549488202358e-06, "loss": 0.8588621, "num_input_tokens_seen": 12831085, "step": 608, "time_per_iteration": 2.6593453884124756 }, { "auxiliary_loss_clip": 0.01251062, "auxiliary_loss_mlp": 0.01073705, "balance_loss_clip": 1.06682992, "balance_loss_mlp": 1.04006422, "epoch": 0.036615060874793326, "flos": 17819772935040.0, "grad_norm": 2.4795108668903305, "language_loss": 0.8201133, "learning_rate": 3.999541184329688e-06, "loss": 0.84336102, "num_input_tokens_seen": 12849115, "step": 609, "time_per_iteration": 2.6299383640289307 }, { "auxiliary_loss_clip": 0.01255655, "auxiliary_loss_mlp": 0.01091893, "balance_loss_clip": 1.07322037, "balance_loss_mlp": 1.06158984, "epoch": 0.0366751841274613, "flos": 26753808320640.0, "grad_norm": 1.992640540297191, "language_loss": 0.79448462, "learning_rate": 3.999532804634215e-06, "loss": 0.81796008, "num_input_tokens_seen": 12868005, "step": 610, "time_per_iteration": 2.65120530128479 }, { "auxiliary_loss_clip": 0.01254423, "auxiliary_loss_mlp": 0.01088228, "balance_loss_clip": 1.06914616, "balance_loss_mlp": 1.05656588, "epoch": 0.03673530738012926, "flos": 22196960202240.0, "grad_norm": 1.9328503999291824, "language_loss": 0.87282723, "learning_rate": 3.9995243491162575e-06, "loss": 0.89625371, "num_input_tokens_seen": 12886890, "step": 611, "time_per_iteration": 2.7398059368133545 }, { "auxiliary_loss_clip": 0.01248885, "auxiliary_loss_mlp": 0.01097673, "balance_loss_clip": 1.06917143, "balance_loss_mlp": 1.06651139, "epoch": 0.036795430632797235, "flos": 24681655601280.0, "grad_norm": 3.7435200854847266, "language_loss": 0.72589231, "learning_rate": 3.999515817776136e-06, "loss": 0.74935788, "num_input_tokens_seen": 12906130, "step": 612, "time_per_iteration": 2.700406551361084 }, { "auxiliary_loss_clip": 0.01249112, "auxiliary_loss_mlp": 0.01076924, "balance_loss_clip": 1.06581926, "balance_loss_mlp": 1.04480934, "epoch": 0.0368555538854652, "flos": 17748921358080.0, "grad_norm": 3.0863603820013434, "language_loss": 0.79110008, "learning_rate": 3.999507210614175e-06, "loss": 0.81436038, "num_input_tokens_seen": 12925260, "step": 613, "time_per_iteration": 2.630472183227539 }, { "auxiliary_loss_clip": 0.01242581, "auxiliary_loss_mlp": 0.01090278, "balance_loss_clip": 1.06378841, "balance_loss_mlp": 1.05961776, "epoch": 0.03691567713813317, "flos": 20594554571520.0, "grad_norm": 2.2015687298668336, "language_loss": 0.93885028, "learning_rate": 3.9994985276307e-06, "loss": 0.96217889, "num_input_tokens_seen": 12944590, "step": 614, "time_per_iteration": 2.6977972984313965 }, { "auxiliary_loss_clip": 0.01254503, "auxiliary_loss_mlp": 0.01081137, "balance_loss_clip": 1.07009673, "balance_loss_mlp": 1.04732919, "epoch": 0.036975800390801145, "flos": 33650380546560.0, "grad_norm": 3.0661216019279576, "language_loss": 0.72932875, "learning_rate": 3.999489768826041e-06, "loss": 0.75268513, "num_input_tokens_seen": 12964785, "step": 615, "time_per_iteration": 2.697291612625122 }, { "auxiliary_loss_clip": 0.01250213, "auxiliary_loss_mlp": 0.010716, "balance_loss_clip": 1.06649876, "balance_loss_mlp": 1.04015231, "epoch": 0.03703592364346911, "flos": 28293694329600.0, "grad_norm": 2.9941392641088695, "language_loss": 0.81630868, "learning_rate": 3.999480934200528e-06, "loss": 0.83952683, "num_input_tokens_seen": 12986705, "step": 616, "time_per_iteration": 4.1762495040893555 }, { "auxiliary_loss_clip": 0.0124999, "auxiliary_loss_mlp": 0.01076541, "balance_loss_clip": 1.06807041, "balance_loss_mlp": 1.0467627, "epoch": 0.03709604689613708, "flos": 31504215853440.0, "grad_norm": 2.320593216419041, "language_loss": 0.68178958, "learning_rate": 3.999472023754499e-06, "loss": 0.70505488, "num_input_tokens_seen": 13010560, "step": 617, "time_per_iteration": 4.224538564682007 }, { "auxiliary_loss_clip": 0.01254259, "auxiliary_loss_mlp": 0.010771, "balance_loss_clip": 1.07098567, "balance_loss_mlp": 1.04415071, "epoch": 0.03715617014880505, "flos": 19609381272960.0, "grad_norm": 2.245411088847763, "language_loss": 0.80595517, "learning_rate": 3.99946303748829e-06, "loss": 0.82926875, "num_input_tokens_seen": 13028935, "step": 618, "time_per_iteration": 4.200341463088989 }, { "auxiliary_loss_clip": 0.01257669, "auxiliary_loss_mlp": 0.01079294, "balance_loss_clip": 1.06808555, "balance_loss_mlp": 1.04605901, "epoch": 0.03721629340147302, "flos": 15924192497280.0, "grad_norm": 10.155035046705617, "language_loss": 0.91591841, "learning_rate": 3.999453975402242e-06, "loss": 0.93928802, "num_input_tokens_seen": 13046000, "step": 619, "time_per_iteration": 2.5787301063537598 }, { "auxiliary_loss_clip": 0.01251145, "auxiliary_loss_mlp": 0.01083548, "balance_loss_clip": 1.06999123, "balance_loss_mlp": 1.05181432, "epoch": 0.03727641665414099, "flos": 21104090951040.0, "grad_norm": 2.0803022158745406, "language_loss": 0.94071603, "learning_rate": 3.9994448374967e-06, "loss": 0.96406299, "num_input_tokens_seen": 13062995, "step": 620, "time_per_iteration": 2.5987205505371094 }, { "auxiliary_loss_clip": 0.01249568, "auxiliary_loss_mlp": 0.0108317, "balance_loss_clip": 1.06624317, "balance_loss_mlp": 1.0502919, "epoch": 0.037336539906808956, "flos": 24131683486080.0, "grad_norm": 1.7431896174296577, "language_loss": 0.77319217, "learning_rate": 3.999435623772008e-06, "loss": 0.79651952, "num_input_tokens_seen": 13084120, "step": 621, "time_per_iteration": 2.68758225440979 }, { "auxiliary_loss_clip": 0.01247252, "auxiliary_loss_mlp": 0.01071013, "balance_loss_clip": 1.06894088, "balance_loss_mlp": 1.03792048, "epoch": 0.03739666315947693, "flos": 22346384780160.0, "grad_norm": 2.3852872810563364, "language_loss": 0.86546707, "learning_rate": 3.999426334228518e-06, "loss": 0.88864976, "num_input_tokens_seen": 13100035, "step": 622, "time_per_iteration": 2.607121467590332 }, { "auxiliary_loss_clip": 0.012499, "auxiliary_loss_mlp": 0.01072461, "balance_loss_clip": 1.06715882, "balance_loss_mlp": 1.04048872, "epoch": 0.0374567864121449, "flos": 20449511452800.0, "grad_norm": 2.2621736327299766, "language_loss": 0.90008956, "learning_rate": 3.999416968866581e-06, "loss": 0.92331314, "num_input_tokens_seen": 13118070, "step": 623, "time_per_iteration": 2.6513512134552 }, { "auxiliary_loss_clip": 0.01251762, "auxiliary_loss_mlp": 0.01090534, "balance_loss_clip": 1.07006013, "balance_loss_mlp": 1.05844235, "epoch": 0.037516909664812866, "flos": 19208043636480.0, "grad_norm": 2.760597076727266, "language_loss": 0.84095174, "learning_rate": 3.999407527686551e-06, "loss": 0.8643747, "num_input_tokens_seen": 13136355, "step": 624, "time_per_iteration": 2.66623592376709 }, { "auxiliary_loss_clip": 0.01252431, "auxiliary_loss_mlp": 0.01076353, "balance_loss_clip": 1.06697702, "balance_loss_mlp": 1.04423809, "epoch": 0.03757703291748084, "flos": 35005218664320.0, "grad_norm": 4.259276014089895, "language_loss": 0.66778994, "learning_rate": 3.999398010688788e-06, "loss": 0.69107783, "num_input_tokens_seen": 13155435, "step": 625, "time_per_iteration": 2.7288877964019775 }, { "auxiliary_loss_clip": 0.01244959, "auxiliary_loss_mlp": 0.01076274, "balance_loss_clip": 1.06605244, "balance_loss_mlp": 1.042943, "epoch": 0.0376371561701488, "flos": 25483899911040.0, "grad_norm": 3.375450269409945, "language_loss": 0.77496696, "learning_rate": 3.999388417873652e-06, "loss": 0.79817927, "num_input_tokens_seen": 13174295, "step": 626, "time_per_iteration": 2.648942470550537 }, { "auxiliary_loss_clip": 0.01249107, "auxiliary_loss_mlp": 0.0108376, "balance_loss_clip": 1.06770003, "balance_loss_mlp": 1.05200303, "epoch": 0.037697279422816775, "flos": 18185630912640.0, "grad_norm": 2.0480468386724766, "language_loss": 0.81463408, "learning_rate": 3.999378749241506e-06, "loss": 0.83796275, "num_input_tokens_seen": 13192500, "step": 627, "time_per_iteration": 2.6209845542907715 }, { "auxiliary_loss_clip": 0.01254363, "auxiliary_loss_mlp": 0.01084942, "balance_loss_clip": 1.07041132, "balance_loss_mlp": 1.05215955, "epoch": 0.03775740267548475, "flos": 24644272521600.0, "grad_norm": 1.6934072791943036, "language_loss": 0.88809037, "learning_rate": 3.999369004792719e-06, "loss": 0.91148341, "num_input_tokens_seen": 13213470, "step": 628, "time_per_iteration": 2.7221415042877197 }, { "auxiliary_loss_clip": 0.01247303, "auxiliary_loss_mlp": 0.01080197, "balance_loss_clip": 1.0627017, "balance_loss_mlp": 1.04765344, "epoch": 0.03781752592815271, "flos": 21288205088640.0, "grad_norm": 2.536151380104699, "language_loss": 0.79840028, "learning_rate": 3.999359184527658e-06, "loss": 0.82167524, "num_input_tokens_seen": 13232365, "step": 629, "time_per_iteration": 2.6535024642944336 }, { "auxiliary_loss_clip": 0.01249218, "auxiliary_loss_mlp": 0.0106958, "balance_loss_clip": 1.06675959, "balance_loss_mlp": 1.03885961, "epoch": 0.037877649180820684, "flos": 22089623385600.0, "grad_norm": 1.6861994278356789, "language_loss": 0.76824844, "learning_rate": 3.999349288446696e-06, "loss": 0.79143643, "num_input_tokens_seen": 13251920, "step": 630, "time_per_iteration": 2.6175966262817383 }, { "auxiliary_loss_clip": 0.01254291, "auxiliary_loss_mlp": 0.01075963, "balance_loss_clip": 1.06833327, "balance_loss_mlp": 1.04504025, "epoch": 0.03793777243348865, "flos": 14501339976960.0, "grad_norm": 3.12435515576561, "language_loss": 0.91593724, "learning_rate": 3.99933931655021e-06, "loss": 0.93923974, "num_input_tokens_seen": 13267440, "step": 631, "time_per_iteration": 2.565293788909912 }, { "auxiliary_loss_clip": 0.01243525, "auxiliary_loss_mlp": 0.01087901, "balance_loss_clip": 1.06386209, "balance_loss_mlp": 1.05356884, "epoch": 0.03799789568615662, "flos": 21908418249600.0, "grad_norm": 1.6822536287963328, "language_loss": 0.92157543, "learning_rate": 3.999329268838575e-06, "loss": 0.94488978, "num_input_tokens_seen": 13287850, "step": 632, "time_per_iteration": 2.6235203742980957 }, { "auxiliary_loss_clip": 0.01248362, "auxiliary_loss_mlp": 0.01067296, "balance_loss_clip": 1.06696796, "balance_loss_mlp": 1.03613472, "epoch": 0.03805801893882459, "flos": 24827021942400.0, "grad_norm": 2.1097171792430456, "language_loss": 0.83139223, "learning_rate": 3.999319145312175e-06, "loss": 0.85454881, "num_input_tokens_seen": 13307760, "step": 633, "time_per_iteration": 2.6461985111236572 }, { "auxiliary_loss_clip": 0.01247735, "auxiliary_loss_mlp": 0.01079895, "balance_loss_clip": 1.06473529, "balance_loss_mlp": 1.04811358, "epoch": 0.03811814219149256, "flos": 30482952364800.0, "grad_norm": 1.599115294194595, "language_loss": 0.69883299, "learning_rate": 3.999308945971392e-06, "loss": 0.72210932, "num_input_tokens_seen": 13331230, "step": 634, "time_per_iteration": 2.709033727645874 }, { "auxiliary_loss_clip": 0.01133204, "auxiliary_loss_mlp": 0.01009504, "balance_loss_clip": 1.04124916, "balance_loss_mlp": 1.00249422, "epoch": 0.03817826544416053, "flos": 66992577379200.0, "grad_norm": 0.893126545279708, "language_loss": 0.61645919, "learning_rate": 3.999298670816614e-06, "loss": 0.63788629, "num_input_tokens_seen": 13394760, "step": 635, "time_per_iteration": 3.2099475860595703 }, { "auxiliary_loss_clip": 0.01244276, "auxiliary_loss_mlp": 0.01072984, "balance_loss_clip": 1.06475401, "balance_loss_mlp": 1.04129851, "epoch": 0.038238388696828496, "flos": 20485350247680.0, "grad_norm": 2.0563589539657205, "language_loss": 0.83629507, "learning_rate": 3.9992883198482294e-06, "loss": 0.85946769, "num_input_tokens_seen": 13412775, "step": 636, "time_per_iteration": 2.6278960704803467 }, { "auxiliary_loss_clip": 0.01248078, "auxiliary_loss_mlp": 0.01096471, "balance_loss_clip": 1.06714165, "balance_loss_mlp": 1.06530952, "epoch": 0.03829851194949647, "flos": 17965893461760.0, "grad_norm": 2.346379148367956, "language_loss": 0.79578567, "learning_rate": 3.999277893066632e-06, "loss": 0.81923115, "num_input_tokens_seen": 13427835, "step": 637, "time_per_iteration": 2.646414279937744 }, { "auxiliary_loss_clip": 0.01247939, "auxiliary_loss_mlp": 0.01088528, "balance_loss_clip": 1.06356907, "balance_loss_mlp": 1.0562222, "epoch": 0.03835863520216444, "flos": 22456522857600.0, "grad_norm": 1.9563283234999833, "language_loss": 0.83989692, "learning_rate": 3.999267390472215e-06, "loss": 0.86326158, "num_input_tokens_seen": 13447295, "step": 638, "time_per_iteration": 2.6416285037994385 }, { "auxiliary_loss_clip": 0.01253172, "auxiliary_loss_mlp": 0.01074704, "balance_loss_clip": 1.06563985, "balance_loss_mlp": 1.04163575, "epoch": 0.038418758454832405, "flos": 22164425458560.0, "grad_norm": 2.5596504471077224, "language_loss": 0.70109725, "learning_rate": 3.999256812065381e-06, "loss": 0.72437602, "num_input_tokens_seen": 13468455, "step": 639, "time_per_iteration": 2.610682487487793 }, { "auxiliary_loss_clip": 0.01248829, "auxiliary_loss_mlp": 0.01081808, "balance_loss_clip": 1.06618333, "balance_loss_mlp": 1.04790449, "epoch": 0.03847888170750038, "flos": 22747435107840.0, "grad_norm": 2.5791624605537082, "language_loss": 0.85322344, "learning_rate": 3.999246157846526e-06, "loss": 0.87652987, "num_input_tokens_seen": 13489085, "step": 640, "time_per_iteration": 2.700456380844116 }, { "auxiliary_loss_clip": 0.01252579, "auxiliary_loss_mlp": 0.01083722, "balance_loss_clip": 1.06751871, "balance_loss_mlp": 1.04934239, "epoch": 0.03853900496016834, "flos": 22711201263360.0, "grad_norm": 2.331268680461456, "language_loss": 0.82141805, "learning_rate": 3.9992354278160574e-06, "loss": 0.84478104, "num_input_tokens_seen": 13509120, "step": 641, "time_per_iteration": 2.6572046279907227 }, { "auxiliary_loss_clip": 0.0112759, "auxiliary_loss_mlp": 0.01008008, "balance_loss_clip": 1.03825259, "balance_loss_mlp": 1.00095105, "epoch": 0.038599128212836314, "flos": 70399136355840.0, "grad_norm": 0.9037629700551453, "language_loss": 0.65444964, "learning_rate": 3.999224621974381e-06, "loss": 0.67580563, "num_input_tokens_seen": 13562005, "step": 642, "time_per_iteration": 3.199925422668457 }, { "auxiliary_loss_clip": 0.01246698, "auxiliary_loss_mlp": 0.01064563, "balance_loss_clip": 1.0651319, "balance_loss_mlp": 1.03453398, "epoch": 0.03865925146550429, "flos": 23295144666240.0, "grad_norm": 1.9113268312481755, "language_loss": 0.79272145, "learning_rate": 3.999213740321906e-06, "loss": 0.81583405, "num_input_tokens_seen": 13582185, "step": 643, "time_per_iteration": 2.641437292098999 }, { "auxiliary_loss_clip": 0.01244786, "auxiliary_loss_mlp": 0.01076057, "balance_loss_clip": 1.06219232, "balance_loss_mlp": 1.04599261, "epoch": 0.03871937471817225, "flos": 21430446946560.0, "grad_norm": 2.2104774200729262, "language_loss": 0.8294487, "learning_rate": 3.999202782859046e-06, "loss": 0.85265714, "num_input_tokens_seen": 13599555, "step": 644, "time_per_iteration": 2.600558280944824 }, { "auxiliary_loss_clip": 0.01247273, "auxiliary_loss_mlp": 0.01074554, "balance_loss_clip": 1.06383467, "balance_loss_mlp": 1.04193854, "epoch": 0.038779497970840224, "flos": 34277309550720.0, "grad_norm": 1.994902925690418, "language_loss": 0.82286513, "learning_rate": 3.9991917495862165e-06, "loss": 0.8460834, "num_input_tokens_seen": 13621160, "step": 645, "time_per_iteration": 2.6751983165740967 }, { "auxiliary_loss_clip": 0.01248631, "auxiliary_loss_mlp": 0.01070807, "balance_loss_clip": 1.06525111, "balance_loss_mlp": 1.03890657, "epoch": 0.03883962122350819, "flos": 22748189293440.0, "grad_norm": 2.290384247239265, "language_loss": 0.81889713, "learning_rate": 3.9991806405038345e-06, "loss": 0.84209144, "num_input_tokens_seen": 13641915, "step": 646, "time_per_iteration": 2.6987667083740234 }, { "auxiliary_loss_clip": 0.01250204, "auxiliary_loss_mlp": 0.01078836, "balance_loss_clip": 1.06982899, "balance_loss_mlp": 1.04791331, "epoch": 0.03889974447617616, "flos": 21945837242880.0, "grad_norm": 1.9171219640425325, "language_loss": 0.82015383, "learning_rate": 3.999169455612323e-06, "loss": 0.84344423, "num_input_tokens_seen": 13661410, "step": 647, "time_per_iteration": 2.590102195739746 }, { "auxiliary_loss_clip": 0.0124696, "auxiliary_loss_mlp": 0.01072111, "balance_loss_clip": 1.06628954, "balance_loss_mlp": 1.04216528, "epoch": 0.03895986772884413, "flos": 31504826384640.0, "grad_norm": 1.9398424653049293, "language_loss": 0.84477997, "learning_rate": 3.999158194912106e-06, "loss": 0.86797059, "num_input_tokens_seen": 13681705, "step": 648, "time_per_iteration": 2.7516121864318848 }, { "auxiliary_loss_clip": 0.01244808, "auxiliary_loss_mlp": 0.0107293, "balance_loss_clip": 1.06524062, "balance_loss_mlp": 1.04210222, "epoch": 0.0390199909815121, "flos": 19901011795200.0, "grad_norm": 2.3870859420748136, "language_loss": 0.84254295, "learning_rate": 3.9991468584036086e-06, "loss": 0.86572027, "num_input_tokens_seen": 13700400, "step": 649, "time_per_iteration": 2.6116180419921875 }, { "auxiliary_loss_clip": 0.01246653, "auxiliary_loss_mlp": 0.01073574, "balance_loss_clip": 1.06560743, "balance_loss_mlp": 1.0416739, "epoch": 0.03908011423418007, "flos": 21612478095360.0, "grad_norm": 2.00775905451926, "language_loss": 0.79783499, "learning_rate": 3.999135446087263e-06, "loss": 0.82103723, "num_input_tokens_seen": 13720145, "step": 650, "time_per_iteration": 2.574939727783203 }, { "auxiliary_loss_clip": 0.01242721, "auxiliary_loss_mlp": 0.01077536, "balance_loss_clip": 1.06209707, "balance_loss_mlp": 1.04534984, "epoch": 0.039140237486848035, "flos": 18661411486080.0, "grad_norm": 2.334811800093409, "language_loss": 0.78698987, "learning_rate": 3.9991239579635e-06, "loss": 0.81019247, "num_input_tokens_seen": 13737500, "step": 651, "time_per_iteration": 2.5930917263031006 }, { "auxiliary_loss_clip": 0.0124425, "auxiliary_loss_mlp": 0.010838, "balance_loss_clip": 1.06317663, "balance_loss_mlp": 1.05087411, "epoch": 0.03920036073951601, "flos": 18661124177280.0, "grad_norm": 3.361008988618244, "language_loss": 0.87392938, "learning_rate": 3.999112394032757e-06, "loss": 0.89720988, "num_input_tokens_seen": 13754750, "step": 652, "time_per_iteration": 2.6072869300842285 }, { "auxiliary_loss_clip": 0.01239638, "auxiliary_loss_mlp": 0.01073938, "balance_loss_clip": 1.06362963, "balance_loss_mlp": 1.0434916, "epoch": 0.03926048399218398, "flos": 31354468053120.0, "grad_norm": 2.6218665998754904, "language_loss": 0.79297256, "learning_rate": 3.999100754295471e-06, "loss": 0.81610829, "num_input_tokens_seen": 13771990, "step": 653, "time_per_iteration": 2.626145362854004 }, { "auxiliary_loss_clip": 0.01250652, "auxiliary_loss_mlp": 0.01075546, "balance_loss_clip": 1.06496143, "balance_loss_mlp": 1.04374111, "epoch": 0.039320607244851945, "flos": 29603499770880.0, "grad_norm": 2.0720296605490094, "language_loss": 0.85909009, "learning_rate": 3.999089038752085e-06, "loss": 0.88235211, "num_input_tokens_seen": 13792750, "step": 654, "time_per_iteration": 2.6775124073028564 }, { "auxiliary_loss_clip": 0.01126661, "auxiliary_loss_mlp": 0.01016641, "balance_loss_clip": 1.03977203, "balance_loss_mlp": 1.01001298, "epoch": 0.03938073049751992, "flos": 66534609951360.0, "grad_norm": 0.7366259780501333, "language_loss": 0.4997997, "learning_rate": 3.999077247403041e-06, "loss": 0.52123272, "num_input_tokens_seen": 13858570, "step": 655, "time_per_iteration": 3.3006510734558105 }, { "auxiliary_loss_clip": 0.01241143, "auxiliary_loss_mlp": 0.01076374, "balance_loss_clip": 1.0658412, "balance_loss_mlp": 1.04680991, "epoch": 0.03944085375018788, "flos": 23367827836800.0, "grad_norm": 4.17474796245144, "language_loss": 0.80903178, "learning_rate": 3.9990653802487886e-06, "loss": 0.83220696, "num_input_tokens_seen": 13876335, "step": 656, "time_per_iteration": 4.228931427001953 }, { "auxiliary_loss_clip": 0.01251519, "auxiliary_loss_mlp": 0.01093573, "balance_loss_clip": 1.06740427, "balance_loss_mlp": 1.05802524, "epoch": 0.039500977002855854, "flos": 18548292579840.0, "grad_norm": 2.068956760077258, "language_loss": 0.76289558, "learning_rate": 3.999053437289776e-06, "loss": 0.7863465, "num_input_tokens_seen": 13892640, "step": 657, "time_per_iteration": 4.218473434448242 }, { "auxiliary_loss_clip": 0.0124824, "auxiliary_loss_mlp": 0.01076812, "balance_loss_clip": 1.06641233, "balance_loss_mlp": 1.04522133, "epoch": 0.039561100255523826, "flos": 25338174433920.0, "grad_norm": 2.07475431213476, "language_loss": 0.8179062, "learning_rate": 3.999041418526457e-06, "loss": 0.84115672, "num_input_tokens_seen": 13910085, "step": 658, "time_per_iteration": 2.671675682067871 }, { "auxiliary_loss_clip": 0.01242678, "auxiliary_loss_mlp": 0.01077963, "balance_loss_clip": 1.06347871, "balance_loss_mlp": 1.0454669, "epoch": 0.03962122350819179, "flos": 18219889509120.0, "grad_norm": 2.2444983110753625, "language_loss": 0.90790772, "learning_rate": 3.999029323959287e-06, "loss": 0.93111408, "num_input_tokens_seen": 13928800, "step": 659, "time_per_iteration": 4.2601988315582275 }, { "auxiliary_loss_clip": 0.01247633, "auxiliary_loss_mlp": 0.01073069, "balance_loss_clip": 1.06654835, "balance_loss_mlp": 1.04215825, "epoch": 0.03968134676085976, "flos": 20522230536960.0, "grad_norm": 2.2083626038373656, "language_loss": 0.79760063, "learning_rate": 3.999017153588724e-06, "loss": 0.82080764, "num_input_tokens_seen": 13948325, "step": 660, "time_per_iteration": 2.62716007232666 }, { "auxiliary_loss_clip": 0.01246027, "auxiliary_loss_mlp": 0.01077579, "balance_loss_clip": 1.0675652, "balance_loss_mlp": 1.0456785, "epoch": 0.03974147001352773, "flos": 22422587483520.0, "grad_norm": 1.6747851381362888, "language_loss": 0.81757367, "learning_rate": 3.999004907415231e-06, "loss": 0.8408097, "num_input_tokens_seen": 13969090, "step": 661, "time_per_iteration": 2.645423412322998 }, { "auxiliary_loss_clip": 0.01119895, "auxiliary_loss_mlp": 0.01007167, "balance_loss_clip": 1.03320217, "balance_loss_mlp": 1.00077713, "epoch": 0.0398015932661957, "flos": 71128769322240.0, "grad_norm": 0.9117564509831767, "language_loss": 0.69349593, "learning_rate": 3.998992585439272e-06, "loss": 0.71476656, "num_input_tokens_seen": 14037555, "step": 662, "time_per_iteration": 3.3032331466674805 }, { "auxiliary_loss_clip": 0.01249217, "auxiliary_loss_mlp": 0.01074722, "balance_loss_clip": 1.06995225, "balance_loss_mlp": 1.04322648, "epoch": 0.03986171651886367, "flos": 16800951571200.0, "grad_norm": 2.160679749799672, "language_loss": 0.82765651, "learning_rate": 3.998980187661314e-06, "loss": 0.85089582, "num_input_tokens_seen": 14055765, "step": 663, "time_per_iteration": 2.6217782497406006 }, { "auxiliary_loss_clip": 0.01252759, "auxiliary_loss_mlp": 0.01063705, "balance_loss_clip": 1.06966817, "balance_loss_mlp": 1.03254378, "epoch": 0.03992183977153164, "flos": 24535068197760.0, "grad_norm": 2.19374813563436, "language_loss": 0.87302262, "learning_rate": 3.998967714081826e-06, "loss": 0.89618725, "num_input_tokens_seen": 14074195, "step": 664, "time_per_iteration": 2.6729183197021484 }, { "auxiliary_loss_clip": 0.01241647, "auxiliary_loss_mlp": 0.0106515, "balance_loss_clip": 1.06656313, "balance_loss_mlp": 1.03346384, "epoch": 0.03998196302419961, "flos": 15595897167360.0, "grad_norm": 2.036983550581997, "language_loss": 0.84821391, "learning_rate": 3.998955164701281e-06, "loss": 0.87128186, "num_input_tokens_seen": 14090215, "step": 665, "time_per_iteration": 2.593832015991211 }, { "auxiliary_loss_clip": 0.012521, "auxiliary_loss_mlp": 0.01085682, "balance_loss_clip": 1.06867695, "balance_loss_mlp": 1.05223155, "epoch": 0.04004208627686758, "flos": 25305065072640.0, "grad_norm": 2.172699570421913, "language_loss": 0.81745672, "learning_rate": 3.998942539520158e-06, "loss": 0.8408345, "num_input_tokens_seen": 14112150, "step": 666, "time_per_iteration": 2.6743290424346924 }, { "auxiliary_loss_clip": 0.01241565, "auxiliary_loss_mlp": 0.01073617, "balance_loss_clip": 1.06443083, "balance_loss_mlp": 1.04007161, "epoch": 0.04010220952953555, "flos": 23475847011840.0, "grad_norm": 2.1003520396389828, "language_loss": 0.87117827, "learning_rate": 3.998929838538932e-06, "loss": 0.89433014, "num_input_tokens_seen": 14131475, "step": 667, "time_per_iteration": 2.6147067546844482 }, { "auxiliary_loss_clip": 0.0124275, "auxiliary_loss_mlp": 0.01071583, "balance_loss_clip": 1.07009172, "balance_loss_mlp": 1.04161382, "epoch": 0.04016233278220352, "flos": 18617025254400.0, "grad_norm": 2.331266403294307, "language_loss": 0.80641299, "learning_rate": 3.998917061758087e-06, "loss": 0.82955635, "num_input_tokens_seen": 14146165, "step": 668, "time_per_iteration": 2.6015820503234863 }, { "auxiliary_loss_clip": 0.01115034, "auxiliary_loss_mlp": 0.01008949, "balance_loss_clip": 1.02975297, "balance_loss_mlp": 1.00317907, "epoch": 0.040222456034871484, "flos": 70906194696960.0, "grad_norm": 0.7870483750596657, "language_loss": 0.60066259, "learning_rate": 3.998904209178107e-06, "loss": 0.62190247, "num_input_tokens_seen": 14215005, "step": 669, "time_per_iteration": 3.2993202209472656 }, { "auxiliary_loss_clip": 0.01242272, "auxiliary_loss_mlp": 0.01071485, "balance_loss_clip": 1.06408751, "balance_loss_mlp": 1.04120564, "epoch": 0.040282579287539456, "flos": 23764712186880.0, "grad_norm": 1.7022357666604506, "language_loss": 0.86290276, "learning_rate": 3.9988912807994785e-06, "loss": 0.88604033, "num_input_tokens_seen": 14235510, "step": 670, "time_per_iteration": 2.700657844543457 }, { "auxiliary_loss_clip": 0.01242087, "auxiliary_loss_mlp": 0.01080448, "balance_loss_clip": 1.06647801, "balance_loss_mlp": 1.05014467, "epoch": 0.04034270254020743, "flos": 18478518410880.0, "grad_norm": 1.8224152334464152, "language_loss": 0.75569212, "learning_rate": 3.998878276622692e-06, "loss": 0.77891749, "num_input_tokens_seen": 14254565, "step": 671, "time_per_iteration": 2.6698572635650635 }, { "auxiliary_loss_clip": 0.01248936, "auxiliary_loss_mlp": 0.01076667, "balance_loss_clip": 1.06943047, "balance_loss_mlp": 1.04605412, "epoch": 0.040402825792875394, "flos": 17201858244480.0, "grad_norm": 1.9730812981627939, "language_loss": 0.92416775, "learning_rate": 3.998865196648242e-06, "loss": 0.94742376, "num_input_tokens_seen": 14271885, "step": 672, "time_per_iteration": 2.567563533782959 }, { "auxiliary_loss_clip": 0.01245231, "auxiliary_loss_mlp": 0.010776, "balance_loss_clip": 1.0677104, "balance_loss_mlp": 1.04422188, "epoch": 0.040462949045543366, "flos": 19172168928000.0, "grad_norm": 1.800141829654062, "language_loss": 0.90174723, "learning_rate": 3.998852040876622e-06, "loss": 0.92497551, "num_input_tokens_seen": 14289670, "step": 673, "time_per_iteration": 2.547154426574707 }, { "auxiliary_loss_clip": 0.01239752, "auxiliary_loss_mlp": 0.01084248, "balance_loss_clip": 1.06466973, "balance_loss_mlp": 1.05184698, "epoch": 0.04052307229821133, "flos": 24019821555840.0, "grad_norm": 2.3989934860433486, "language_loss": 0.75016737, "learning_rate": 3.998838809308334e-06, "loss": 0.7734074, "num_input_tokens_seen": 14309285, "step": 674, "time_per_iteration": 2.681896924972534 }, { "auxiliary_loss_clip": 0.01249861, "auxiliary_loss_mlp": 0.01064308, "balance_loss_clip": 1.06744063, "balance_loss_mlp": 1.03334963, "epoch": 0.0405831955508793, "flos": 16436601964800.0, "grad_norm": 2.55613513039197, "language_loss": 0.78289407, "learning_rate": 3.9988255019438766e-06, "loss": 0.80603576, "num_input_tokens_seen": 14328300, "step": 675, "time_per_iteration": 2.6965043544769287 }, { "auxiliary_loss_clip": 0.01241749, "auxiliary_loss_mlp": 0.01079652, "balance_loss_clip": 1.06532836, "balance_loss_mlp": 1.04648817, "epoch": 0.040643318803547275, "flos": 24279922915200.0, "grad_norm": 2.047384767684118, "language_loss": 0.76844448, "learning_rate": 3.998812118783757e-06, "loss": 0.79165846, "num_input_tokens_seen": 14346395, "step": 676, "time_per_iteration": 2.6216623783111572 }, { "auxiliary_loss_clip": 0.01248147, "auxiliary_loss_mlp": 0.01079294, "balance_loss_clip": 1.06811619, "balance_loss_mlp": 1.04813254, "epoch": 0.04070344205621524, "flos": 17712076982400.0, "grad_norm": 2.318905665785744, "language_loss": 0.85139382, "learning_rate": 3.9987986598284804e-06, "loss": 0.8746683, "num_input_tokens_seen": 14364605, "step": 677, "time_per_iteration": 2.5663015842437744 }, { "auxiliary_loss_clip": 0.01240385, "auxiliary_loss_mlp": 0.01070741, "balance_loss_clip": 1.06558609, "balance_loss_mlp": 1.03901923, "epoch": 0.04076356530888321, "flos": 26177658168960.0, "grad_norm": 2.5041724349122645, "language_loss": 0.76572061, "learning_rate": 3.998785125078559e-06, "loss": 0.78883183, "num_input_tokens_seen": 14385265, "step": 678, "time_per_iteration": 2.624689817428589 }, { "auxiliary_loss_clip": 0.01240972, "auxiliary_loss_mlp": 0.01072606, "balance_loss_clip": 1.06374967, "balance_loss_mlp": 1.04242194, "epoch": 0.04082368856155118, "flos": 35773455772800.0, "grad_norm": 1.7096242150987748, "language_loss": 0.82139099, "learning_rate": 3.998771514534505e-06, "loss": 0.84452677, "num_input_tokens_seen": 14406090, "step": 679, "time_per_iteration": 2.7073023319244385 }, { "auxiliary_loss_clip": 0.01248879, "auxiliary_loss_mlp": 0.01064116, "balance_loss_clip": 1.07185793, "balance_loss_mlp": 1.0340035, "epoch": 0.04088381181421915, "flos": 28146640049280.0, "grad_norm": 1.963288262989073, "language_loss": 0.76260424, "learning_rate": 3.998757828196835e-06, "loss": 0.78573418, "num_input_tokens_seen": 14425130, "step": 680, "time_per_iteration": 2.6767218112945557 }, { "auxiliary_loss_clip": 0.01244441, "auxiliary_loss_mlp": 0.01071738, "balance_loss_clip": 1.06458521, "balance_loss_mlp": 1.03864551, "epoch": 0.04094393506688712, "flos": 27597673514880.0, "grad_norm": 1.713943858995997, "language_loss": 0.83089912, "learning_rate": 3.9987440660660685e-06, "loss": 0.85406095, "num_input_tokens_seen": 14447355, "step": 681, "time_per_iteration": 2.6386382579803467 }, { "auxiliary_loss_clip": 0.01244279, "auxiliary_loss_mlp": 0.01073303, "balance_loss_clip": 1.06438065, "balance_loss_mlp": 1.04127121, "epoch": 0.04100405831955509, "flos": 23112036109440.0, "grad_norm": 1.706698119772261, "language_loss": 0.71538687, "learning_rate": 3.998730228142726e-06, "loss": 0.7385627, "num_input_tokens_seen": 14466790, "step": 682, "time_per_iteration": 2.618792772293091 }, { "auxiliary_loss_clip": 0.01243156, "auxiliary_loss_mlp": 0.01078429, "balance_loss_clip": 1.06440282, "balance_loss_mlp": 1.04781592, "epoch": 0.04106418157222306, "flos": 20156731695360.0, "grad_norm": 1.6947476714586034, "language_loss": 0.72599399, "learning_rate": 3.998716314427333e-06, "loss": 0.74920982, "num_input_tokens_seen": 14485195, "step": 683, "time_per_iteration": 2.676133394241333 }, { "auxiliary_loss_clip": 0.01241071, "auxiliary_loss_mlp": 0.01079531, "balance_loss_clip": 1.07077932, "balance_loss_mlp": 1.04851258, "epoch": 0.041124304824891024, "flos": 17420697855360.0, "grad_norm": 2.098652785935233, "language_loss": 0.81419414, "learning_rate": 3.998702324920417e-06, "loss": 0.8374002, "num_input_tokens_seen": 14503370, "step": 684, "time_per_iteration": 2.6538476943969727 }, { "auxiliary_loss_clip": 0.01242791, "auxiliary_loss_mlp": 0.0107365, "balance_loss_clip": 1.06783867, "balance_loss_mlp": 1.04139185, "epoch": 0.041184428077558996, "flos": 25780163287680.0, "grad_norm": 1.5053911947555274, "language_loss": 0.90680599, "learning_rate": 3.9986882596225085e-06, "loss": 0.92997038, "num_input_tokens_seen": 14526415, "step": 685, "time_per_iteration": 2.6541450023651123 }, { "auxiliary_loss_clip": 0.01244219, "auxiliary_loss_mlp": 0.01072481, "balance_loss_clip": 1.06659365, "balance_loss_mlp": 1.04093838, "epoch": 0.04124455133022697, "flos": 22964766347520.0, "grad_norm": 2.2251875217653185, "language_loss": 0.87851977, "learning_rate": 3.998674118534141e-06, "loss": 0.90168673, "num_input_tokens_seen": 14546595, "step": 686, "time_per_iteration": 2.7298531532287598 }, { "auxiliary_loss_clip": 0.01247476, "auxiliary_loss_mlp": 0.01073385, "balance_loss_clip": 1.06586432, "balance_loss_mlp": 1.04224789, "epoch": 0.04130467458289493, "flos": 21289067015040.0, "grad_norm": 1.8582614005091855, "language_loss": 0.7152915, "learning_rate": 3.998659901655851e-06, "loss": 0.73850012, "num_input_tokens_seen": 14566590, "step": 687, "time_per_iteration": 2.6284232139587402 }, { "auxiliary_loss_clip": 0.01243582, "auxiliary_loss_mlp": 0.01076448, "balance_loss_clip": 1.06979251, "balance_loss_mlp": 1.04756403, "epoch": 0.041364797835562905, "flos": 19974233669760.0, "grad_norm": 2.596672934278983, "language_loss": 0.86028284, "learning_rate": 3.998645608988177e-06, "loss": 0.88348317, "num_input_tokens_seen": 14585965, "step": 688, "time_per_iteration": 2.522634506225586 }, { "auxiliary_loss_clip": 0.01241593, "auxiliary_loss_mlp": 0.01079647, "balance_loss_clip": 1.06802177, "balance_loss_mlp": 1.04908216, "epoch": 0.04142492108823087, "flos": 21906227520000.0, "grad_norm": 2.852238187591699, "language_loss": 0.83393514, "learning_rate": 3.998631240531661e-06, "loss": 0.85714757, "num_input_tokens_seen": 14606015, "step": 689, "time_per_iteration": 2.6140944957733154 }, { "auxiliary_loss_clip": 0.01238254, "auxiliary_loss_mlp": 0.01085009, "balance_loss_clip": 1.06293654, "balance_loss_mlp": 1.05463421, "epoch": 0.04148504434089884, "flos": 27639617621760.0, "grad_norm": 2.870474577544969, "language_loss": 0.68398476, "learning_rate": 3.998616796286848e-06, "loss": 0.70721734, "num_input_tokens_seen": 14629955, "step": 690, "time_per_iteration": 2.658987522125244 }, { "auxiliary_loss_clip": 0.01235903, "auxiliary_loss_mlp": 0.01075275, "balance_loss_clip": 1.0625304, "balance_loss_mlp": 1.04565191, "epoch": 0.041545167593566815, "flos": 20518387781760.0, "grad_norm": 1.634289561889102, "language_loss": 0.74927461, "learning_rate": 3.998602276254286e-06, "loss": 0.77238643, "num_input_tokens_seen": 14648000, "step": 691, "time_per_iteration": 2.599957227706909 }, { "auxiliary_loss_clip": 0.01239089, "auxiliary_loss_mlp": 0.01081705, "balance_loss_clip": 1.06458938, "balance_loss_mlp": 1.04978108, "epoch": 0.04160529084623478, "flos": 11868907939200.0, "grad_norm": 2.123432521314224, "language_loss": 0.84469771, "learning_rate": 3.998587680434526e-06, "loss": 0.86790562, "num_input_tokens_seen": 14662235, "step": 692, "time_per_iteration": 2.5748491287231445 }, { "auxiliary_loss_clip": 0.01242126, "auxiliary_loss_mlp": 0.01076613, "balance_loss_clip": 1.06274796, "balance_loss_mlp": 1.04409313, "epoch": 0.04166541409890275, "flos": 14828306503680.0, "grad_norm": 2.3463094595874665, "language_loss": 0.88948715, "learning_rate": 3.99857300882812e-06, "loss": 0.91267455, "num_input_tokens_seen": 14676065, "step": 693, "time_per_iteration": 2.569277286529541 }, { "auxiliary_loss_clip": 0.01245438, "auxiliary_loss_mlp": 0.01071471, "balance_loss_clip": 1.06845784, "balance_loss_mlp": 1.04123962, "epoch": 0.04172553735157072, "flos": 25808137004160.0, "grad_norm": 5.499777597079252, "language_loss": 0.81987685, "learning_rate": 3.998558261435626e-06, "loss": 0.84304595, "num_input_tokens_seen": 14694955, "step": 694, "time_per_iteration": 2.6798722743988037 }, { "auxiliary_loss_clip": 0.01242101, "auxiliary_loss_mlp": 0.01073692, "balance_loss_clip": 1.06179321, "balance_loss_mlp": 1.04303181, "epoch": 0.04178566060423869, "flos": 24279815174400.0, "grad_norm": 2.051302362473346, "language_loss": 0.83672506, "learning_rate": 3.9985434382576015e-06, "loss": 0.85988301, "num_input_tokens_seen": 14715510, "step": 695, "time_per_iteration": 2.684537649154663 }, { "auxiliary_loss_clip": 0.01242205, "auxiliary_loss_mlp": 0.01080004, "balance_loss_clip": 1.06535804, "balance_loss_mlp": 1.04822254, "epoch": 0.04184578385690666, "flos": 18222008411520.0, "grad_norm": 2.113561459264794, "language_loss": 0.84351176, "learning_rate": 3.99852853929461e-06, "loss": 0.86673379, "num_input_tokens_seen": 14731755, "step": 696, "time_per_iteration": 4.1141321659088135 }, { "auxiliary_loss_clip": 0.01238462, "auxiliary_loss_mlp": 0.01083207, "balance_loss_clip": 1.06265593, "balance_loss_mlp": 1.05099702, "epoch": 0.041905907109574626, "flos": 22776342577920.0, "grad_norm": 6.921460264787684, "language_loss": 0.93193012, "learning_rate": 3.998513564547216e-06, "loss": 0.95514685, "num_input_tokens_seen": 14750810, "step": 697, "time_per_iteration": 5.71666693687439 }, { "auxiliary_loss_clip": 0.01235964, "auxiliary_loss_mlp": 0.01074448, "balance_loss_clip": 1.06324339, "balance_loss_mlp": 1.04495573, "epoch": 0.0419660303622426, "flos": 20156947176960.0, "grad_norm": 2.1002029886241904, "language_loss": 0.83775562, "learning_rate": 3.998498514015987e-06, "loss": 0.86085975, "num_input_tokens_seen": 14768435, "step": 698, "time_per_iteration": 4.194530010223389 }, { "auxiliary_loss_clip": 0.01239177, "auxiliary_loss_mlp": 0.01093516, "balance_loss_clip": 1.06274605, "balance_loss_mlp": 1.06175828, "epoch": 0.042026153614910564, "flos": 23076376882560.0, "grad_norm": 2.1234669437327955, "language_loss": 0.91715962, "learning_rate": 3.998483387701495e-06, "loss": 0.94048655, "num_input_tokens_seen": 14786690, "step": 699, "time_per_iteration": 2.6399078369140625 }, { "auxiliary_loss_clip": 0.01113327, "auxiliary_loss_mlp": 0.0102038, "balance_loss_clip": 1.03020263, "balance_loss_mlp": 1.01403797, "epoch": 0.042086276867578536, "flos": 64495243370880.0, "grad_norm": 0.9035134571641164, "language_loss": 0.67873394, "learning_rate": 3.998468185604312e-06, "loss": 0.70007098, "num_input_tokens_seen": 14853840, "step": 700, "time_per_iteration": 3.192026376724243 }, { "auxiliary_loss_clip": 0.01246765, "auxiliary_loss_mlp": 0.01082955, "balance_loss_clip": 1.06717515, "balance_loss_mlp": 1.05017269, "epoch": 0.04214640012024651, "flos": 15487016065920.0, "grad_norm": 2.2754848646841888, "language_loss": 0.884673, "learning_rate": 3.998452907725016e-06, "loss": 0.90797025, "num_input_tokens_seen": 14869580, "step": 701, "time_per_iteration": 2.5790441036224365 }, { "auxiliary_loss_clip": 0.01242428, "auxiliary_loss_mlp": 0.01080259, "balance_loss_clip": 1.06793952, "balance_loss_mlp": 1.04833448, "epoch": 0.04220652337291447, "flos": 23877040993920.0, "grad_norm": 2.000128536077818, "language_loss": 0.67100394, "learning_rate": 3.998437554064184e-06, "loss": 0.69423079, "num_input_tokens_seen": 14891065, "step": 702, "time_per_iteration": 2.6247870922088623 }, { "auxiliary_loss_clip": 0.01107168, "auxiliary_loss_mlp": 0.01005563, "balance_loss_clip": 1.02512407, "balance_loss_mlp": 0.99922067, "epoch": 0.042266646625582445, "flos": 63795451628160.0, "grad_norm": 0.8439205282656718, "language_loss": 0.60756463, "learning_rate": 3.9984221246224006e-06, "loss": 0.62869191, "num_input_tokens_seen": 14954815, "step": 703, "time_per_iteration": 3.1991655826568604 }, { "auxiliary_loss_clip": 0.01107933, "auxiliary_loss_mlp": 0.01006502, "balance_loss_clip": 1.02562141, "balance_loss_mlp": 0.99973089, "epoch": 0.04232676987825041, "flos": 50018863345920.0, "grad_norm": 1.0471369072250156, "language_loss": 0.57677412, "learning_rate": 3.9984066194002494e-06, "loss": 0.59791845, "num_input_tokens_seen": 15003050, "step": 704, "time_per_iteration": 3.037705659866333 }, { "auxiliary_loss_clip": 0.01241513, "auxiliary_loss_mlp": 0.01072126, "balance_loss_clip": 1.06549489, "balance_loss_mlp": 1.0406549, "epoch": 0.04238689313091838, "flos": 21616105368960.0, "grad_norm": 2.9488804643242488, "language_loss": 0.87553984, "learning_rate": 3.998391038398319e-06, "loss": 0.89867628, "num_input_tokens_seen": 15021990, "step": 705, "time_per_iteration": 2.6233222484588623 }, { "auxiliary_loss_clip": 0.01230342, "auxiliary_loss_mlp": 0.0107194, "balance_loss_clip": 1.0605582, "balance_loss_mlp": 1.04204249, "epoch": 0.042447016383586354, "flos": 19135109070720.0, "grad_norm": 2.556815837902013, "language_loss": 0.71071029, "learning_rate": 3.998375381617201e-06, "loss": 0.73373306, "num_input_tokens_seen": 15040700, "step": 706, "time_per_iteration": 2.560434579849243 }, { "auxiliary_loss_clip": 0.0123412, "auxiliary_loss_mlp": 0.01070349, "balance_loss_clip": 1.06249404, "balance_loss_mlp": 1.03799582, "epoch": 0.04250713963625432, "flos": 24426007528320.0, "grad_norm": 2.0814078632624167, "language_loss": 0.93418455, "learning_rate": 3.9983596490574875e-06, "loss": 0.95722926, "num_input_tokens_seen": 15056725, "step": 707, "time_per_iteration": 2.6130473613739014 }, { "auxiliary_loss_clip": 0.01237541, "auxiliary_loss_mlp": 0.01067908, "balance_loss_clip": 1.05994225, "balance_loss_mlp": 1.03617477, "epoch": 0.04256726288892229, "flos": 30367391333760.0, "grad_norm": 2.424205580643553, "language_loss": 0.81514043, "learning_rate": 3.998343840719776e-06, "loss": 0.83819497, "num_input_tokens_seen": 15077550, "step": 708, "time_per_iteration": 2.656277894973755 }, { "auxiliary_loss_clip": 0.01243932, "auxiliary_loss_mlp": 0.0108167, "balance_loss_clip": 1.06461239, "balance_loss_mlp": 1.04934049, "epoch": 0.04262738614159026, "flos": 16362661818240.0, "grad_norm": 2.0883592727868145, "language_loss": 0.82027614, "learning_rate": 3.998327956604666e-06, "loss": 0.8435322, "num_input_tokens_seen": 15094955, "step": 709, "time_per_iteration": 2.5758891105651855 }, { "auxiliary_loss_clip": 0.01243538, "auxiliary_loss_mlp": 0.01071217, "balance_loss_clip": 1.06374872, "balance_loss_mlp": 1.03960264, "epoch": 0.04268750939425823, "flos": 20412379768320.0, "grad_norm": 2.7686525844665133, "language_loss": 0.8502059, "learning_rate": 3.99831199671276e-06, "loss": 0.87335348, "num_input_tokens_seen": 15113395, "step": 710, "time_per_iteration": 2.571559429168701 }, { "auxiliary_loss_clip": 0.0124498, "auxiliary_loss_mlp": 0.01072229, "balance_loss_clip": 1.06788397, "balance_loss_mlp": 1.04166365, "epoch": 0.0427476326469262, "flos": 20302959962880.0, "grad_norm": 7.911177124524585, "language_loss": 0.84914303, "learning_rate": 3.998295961044662e-06, "loss": 0.87231517, "num_input_tokens_seen": 15132920, "step": 711, "time_per_iteration": 2.569959878921509 }, { "auxiliary_loss_clip": 0.01237769, "auxiliary_loss_mlp": 0.01074338, "balance_loss_clip": 1.06188083, "balance_loss_mlp": 1.04229426, "epoch": 0.042807755899594166, "flos": 21650794928640.0, "grad_norm": 1.7189790042473796, "language_loss": 0.85439789, "learning_rate": 3.9982798496009804e-06, "loss": 0.87751901, "num_input_tokens_seen": 15153115, "step": 712, "time_per_iteration": 2.6200509071350098 }, { "auxiliary_loss_clip": 0.01242397, "auxiliary_loss_mlp": 0.01069523, "balance_loss_clip": 1.06085837, "balance_loss_mlp": 1.03989983, "epoch": 0.04286787915226214, "flos": 21435007973760.0, "grad_norm": 5.490507523621204, "language_loss": 0.91178697, "learning_rate": 3.998263662382328e-06, "loss": 0.93490618, "num_input_tokens_seen": 15172770, "step": 713, "time_per_iteration": 2.6353416442871094 }, { "auxiliary_loss_clip": 0.01104693, "auxiliary_loss_mlp": 0.01006514, "balance_loss_clip": 1.02325606, "balance_loss_mlp": 0.99955195, "epoch": 0.04292800240493011, "flos": 66397970615040.0, "grad_norm": 0.9310328114407391, "language_loss": 0.63725489, "learning_rate": 3.9982473993893165e-06, "loss": 0.65836698, "num_input_tokens_seen": 15240055, "step": 714, "time_per_iteration": 3.2544445991516113 }, { "auxiliary_loss_clip": 0.01239175, "auxiliary_loss_mlp": 0.01085992, "balance_loss_clip": 1.06602359, "balance_loss_mlp": 1.05552244, "epoch": 0.042988125657598075, "flos": 31650264552960.0, "grad_norm": 1.8449858143817996, "language_loss": 0.75010103, "learning_rate": 3.998231060622563e-06, "loss": 0.77335274, "num_input_tokens_seen": 15261585, "step": 715, "time_per_iteration": 2.7048466205596924 }, { "auxiliary_loss_clip": 0.01242734, "auxiliary_loss_mlp": 0.01074126, "balance_loss_clip": 1.0666225, "balance_loss_mlp": 1.04227352, "epoch": 0.04304824891026605, "flos": 33248468292480.0, "grad_norm": 1.9505519101092619, "language_loss": 0.72289199, "learning_rate": 3.998214646082688e-06, "loss": 0.74606061, "num_input_tokens_seen": 15281160, "step": 716, "time_per_iteration": 2.7807397842407227 }, { "auxiliary_loss_clip": 0.01104303, "auxiliary_loss_mlp": 0.01006894, "balance_loss_clip": 1.02277207, "balance_loss_mlp": 0.99997944, "epoch": 0.04310837216293401, "flos": 64064782782720.0, "grad_norm": 0.9245106661639481, "language_loss": 0.65587437, "learning_rate": 3.998198155770314e-06, "loss": 0.67698634, "num_input_tokens_seen": 15344505, "step": 717, "time_per_iteration": 3.250870943069458 }, { "auxiliary_loss_clip": 0.01103971, "auxiliary_loss_mlp": 0.01009587, "balance_loss_clip": 1.02238059, "balance_loss_mlp": 1.00267255, "epoch": 0.043168495415601985, "flos": 61343757849600.0, "grad_norm": 0.9849394627593366, "language_loss": 0.58785796, "learning_rate": 3.998181589686065e-06, "loss": 0.60899353, "num_input_tokens_seen": 15404050, "step": 718, "time_per_iteration": 3.0402464866638184 }, { "auxiliary_loss_clip": 0.0124025, "auxiliary_loss_mlp": 0.0107507, "balance_loss_clip": 1.06784248, "balance_loss_mlp": 1.0424546, "epoch": 0.04322861866826996, "flos": 20704261685760.0, "grad_norm": 1.9557310597444375, "language_loss": 0.91440111, "learning_rate": 3.99816494783057e-06, "loss": 0.9375543, "num_input_tokens_seen": 15424190, "step": 719, "time_per_iteration": 2.6500089168548584 }, { "auxiliary_loss_clip": 0.01235843, "auxiliary_loss_mlp": 0.01072906, "balance_loss_clip": 1.06020999, "balance_loss_mlp": 1.04296041, "epoch": 0.04328874192093792, "flos": 30373352991360.0, "grad_norm": 1.7057721639328365, "language_loss": 0.66461253, "learning_rate": 3.99814823020446e-06, "loss": 0.68770003, "num_input_tokens_seen": 15446500, "step": 720, "time_per_iteration": 2.673184871673584 }, { "auxiliary_loss_clip": 0.01234245, "auxiliary_loss_mlp": 0.01072069, "balance_loss_clip": 1.06111717, "balance_loss_mlp": 1.04131258, "epoch": 0.043348865173605894, "flos": 21944795748480.0, "grad_norm": 1.9491363249287763, "language_loss": 0.77460182, "learning_rate": 3.9981314368083684e-06, "loss": 0.79766488, "num_input_tokens_seen": 15465830, "step": 721, "time_per_iteration": 2.6695611476898193 }, { "auxiliary_loss_clip": 0.01241854, "auxiliary_loss_mlp": 0.01087169, "balance_loss_clip": 1.06622314, "balance_loss_mlp": 1.05719972, "epoch": 0.04340898842627386, "flos": 15264225959040.0, "grad_norm": 2.8383174670702718, "language_loss": 0.88298881, "learning_rate": 3.998114567642933e-06, "loss": 0.90627909, "num_input_tokens_seen": 15479985, "step": 722, "time_per_iteration": 2.661313533782959 }, { "auxiliary_loss_clip": 0.01244836, "auxiliary_loss_mlp": 0.01076885, "balance_loss_clip": 1.06665182, "balance_loss_mlp": 1.0480125, "epoch": 0.04346911167894183, "flos": 27965434913280.0, "grad_norm": 5.515838365549148, "language_loss": 0.84387141, "learning_rate": 3.998097622708792e-06, "loss": 0.86708868, "num_input_tokens_seen": 15501545, "step": 723, "time_per_iteration": 2.6447954177856445 }, { "auxiliary_loss_clip": 0.01245825, "auxiliary_loss_mlp": 0.01081354, "balance_loss_clip": 1.06723523, "balance_loss_mlp": 1.05019248, "epoch": 0.0435292349316098, "flos": 29242202820480.0, "grad_norm": 1.7852936089408447, "language_loss": 0.82789439, "learning_rate": 3.99808060200659e-06, "loss": 0.85116619, "num_input_tokens_seen": 15521725, "step": 724, "time_per_iteration": 2.676985263824463 }, { "auxiliary_loss_clip": 0.0124127, "auxiliary_loss_mlp": 0.01087491, "balance_loss_clip": 1.06535757, "balance_loss_mlp": 1.05609179, "epoch": 0.04358935818427777, "flos": 20558356640640.0, "grad_norm": 2.011685360503238, "language_loss": 0.79444051, "learning_rate": 3.998063505536971e-06, "loss": 0.81772816, "num_input_tokens_seen": 15540910, "step": 725, "time_per_iteration": 2.6241447925567627 }, { "auxiliary_loss_clip": 0.01251777, "auxiliary_loss_mlp": 0.01074923, "balance_loss_clip": 1.06783843, "balance_loss_mlp": 1.04309392, "epoch": 0.04364948143694574, "flos": 14464926564480.0, "grad_norm": 2.2160842755462817, "language_loss": 0.87175703, "learning_rate": 3.998046333300584e-06, "loss": 0.89502406, "num_input_tokens_seen": 15558640, "step": 726, "time_per_iteration": 2.555551052093506 }, { "auxiliary_loss_clip": 0.01100917, "auxiliary_loss_mlp": 0.01015411, "balance_loss_clip": 1.02171838, "balance_loss_mlp": 1.00947404, "epoch": 0.043709604689613706, "flos": 50067268922880.0, "grad_norm": 0.908981905466007, "language_loss": 0.55868411, "learning_rate": 3.998029085298079e-06, "loss": 0.5798474, "num_input_tokens_seen": 15612975, "step": 727, "time_per_iteration": 3.375901699066162 }, { "auxiliary_loss_clip": 0.01245647, "auxiliary_loss_mlp": 0.0108809, "balance_loss_clip": 1.06717396, "balance_loss_mlp": 1.05614173, "epoch": 0.04376972794228168, "flos": 13991588115840.0, "grad_norm": 2.282663852625415, "language_loss": 0.82326066, "learning_rate": 3.998011761530112e-06, "loss": 0.84659809, "num_input_tokens_seen": 15631070, "step": 728, "time_per_iteration": 2.605970621109009 }, { "auxiliary_loss_clip": 0.01237902, "auxiliary_loss_mlp": 0.01073495, "balance_loss_clip": 1.06600416, "balance_loss_mlp": 1.04321551, "epoch": 0.04382985119494965, "flos": 22009901149440.0, "grad_norm": 2.1303486954703152, "language_loss": 0.76890069, "learning_rate": 3.997994361997338e-06, "loss": 0.7920146, "num_input_tokens_seen": 15647825, "step": 729, "time_per_iteration": 2.652466297149658 }, { "auxiliary_loss_clip": 0.01243746, "auxiliary_loss_mlp": 0.01079207, "balance_loss_clip": 1.06438255, "balance_loss_mlp": 1.04859376, "epoch": 0.043889974447617615, "flos": 24206521472640.0, "grad_norm": 2.1385115795714107, "language_loss": 0.95153189, "learning_rate": 3.997976886700417e-06, "loss": 0.97476137, "num_input_tokens_seen": 15668260, "step": 730, "time_per_iteration": 2.734614133834839 }, { "auxiliary_loss_clip": 0.01238581, "auxiliary_loss_mlp": 0.01074727, "balance_loss_clip": 1.06093788, "balance_loss_mlp": 1.04315984, "epoch": 0.04395009770028559, "flos": 17274541415040.0, "grad_norm": 2.333073864238008, "language_loss": 0.88456279, "learning_rate": 3.997959335640013e-06, "loss": 0.90769589, "num_input_tokens_seen": 15685630, "step": 731, "time_per_iteration": 2.5912294387817383 }, { "auxiliary_loss_clip": 0.01242247, "auxiliary_loss_mlp": 0.01076563, "balance_loss_clip": 1.06636512, "balance_loss_mlp": 1.04757094, "epoch": 0.04401022095295355, "flos": 12310286261760.0, "grad_norm": 3.0398759554531254, "language_loss": 0.88683128, "learning_rate": 3.997941708816791e-06, "loss": 0.9100194, "num_input_tokens_seen": 15698645, "step": 732, "time_per_iteration": 2.5897367000579834 }, { "auxiliary_loss_clip": 0.01242736, "auxiliary_loss_mlp": 0.01087795, "balance_loss_clip": 1.06544232, "balance_loss_mlp": 1.05646718, "epoch": 0.044070344205621524, "flos": 20959658363520.0, "grad_norm": 2.304959545118842, "language_loss": 0.85829747, "learning_rate": 3.997924006231419e-06, "loss": 0.88160276, "num_input_tokens_seen": 15716775, "step": 733, "time_per_iteration": 2.650681972503662 }, { "auxiliary_loss_clip": 0.01246603, "auxiliary_loss_mlp": 0.01088724, "balance_loss_clip": 1.06722379, "balance_loss_mlp": 1.05544066, "epoch": 0.044130467458289496, "flos": 13845288021120.0, "grad_norm": 2.207780377909299, "language_loss": 0.91189414, "learning_rate": 3.9979062278845685e-06, "loss": 0.93524742, "num_input_tokens_seen": 15733320, "step": 734, "time_per_iteration": 2.5956180095672607 }, { "auxiliary_loss_clip": 0.01238395, "auxiliary_loss_mlp": 0.01067579, "balance_loss_clip": 1.06596422, "balance_loss_mlp": 1.03781235, "epoch": 0.04419059071095746, "flos": 28655063107200.0, "grad_norm": 1.9297536072777384, "language_loss": 0.77884138, "learning_rate": 3.9978883737769125e-06, "loss": 0.8019011, "num_input_tokens_seen": 15752705, "step": 735, "time_per_iteration": 2.603809118270874 }, { "auxiliary_loss_clip": 0.01234188, "auxiliary_loss_mlp": 0.01070499, "balance_loss_clip": 1.06063068, "balance_loss_mlp": 1.04091144, "epoch": 0.04425071396362543, "flos": 28183304856960.0, "grad_norm": 2.266122200005257, "language_loss": 0.8832593, "learning_rate": 3.9978704439091305e-06, "loss": 0.90630615, "num_input_tokens_seen": 15772800, "step": 736, "time_per_iteration": 5.841086149215698 }, { "auxiliary_loss_clip": 0.01235947, "auxiliary_loss_mlp": 0.01081098, "balance_loss_clip": 1.06597185, "balance_loss_mlp": 1.05165362, "epoch": 0.0443108372162934, "flos": 23658452778240.0, "grad_norm": 1.8984177574034653, "language_loss": 0.84481263, "learning_rate": 3.997852438281901e-06, "loss": 0.8679831, "num_input_tokens_seen": 15793665, "step": 737, "time_per_iteration": 4.1386003494262695 }, { "auxiliary_loss_clip": 0.01240863, "auxiliary_loss_mlp": 0.01072388, "balance_loss_clip": 1.0653491, "balance_loss_mlp": 1.03961766, "epoch": 0.04437096046896137, "flos": 33979861025280.0, "grad_norm": 2.2366199062134706, "language_loss": 0.84712577, "learning_rate": 3.997834356895906e-06, "loss": 0.87025833, "num_input_tokens_seen": 15813175, "step": 738, "time_per_iteration": 4.447159290313721 }, { "auxiliary_loss_clip": 0.01098733, "auxiliary_loss_mlp": 0.0102196, "balance_loss_clip": 1.02144337, "balance_loss_mlp": 1.01685739, "epoch": 0.04443108372162934, "flos": 67397506375680.0, "grad_norm": 0.8779518557387592, "language_loss": 0.59179878, "learning_rate": 3.9978161997518324e-06, "loss": 0.61300576, "num_input_tokens_seen": 15872050, "step": 739, "time_per_iteration": 3.0780396461486816 }, { "auxiliary_loss_clip": 0.012386, "auxiliary_loss_mlp": 0.01067387, "balance_loss_clip": 1.06604302, "balance_loss_mlp": 1.03717899, "epoch": 0.04449120697429731, "flos": 29752672953600.0, "grad_norm": 2.295102845773205, "language_loss": 0.91329807, "learning_rate": 3.997797966850369e-06, "loss": 0.93635798, "num_input_tokens_seen": 15891085, "step": 740, "time_per_iteration": 2.6687562465667725 }, { "auxiliary_loss_clip": 0.01243424, "auxiliary_loss_mlp": 0.01067832, "balance_loss_clip": 1.06807768, "balance_loss_mlp": 1.03929377, "epoch": 0.04455133022696528, "flos": 36502119072000.0, "grad_norm": 2.0543845689042484, "language_loss": 0.71875739, "learning_rate": 3.997779658192205e-06, "loss": 0.74186987, "num_input_tokens_seen": 15914225, "step": 741, "time_per_iteration": 2.707231283187866 }, { "auxiliary_loss_clip": 0.01233192, "auxiliary_loss_mlp": 0.01084138, "balance_loss_clip": 1.062482, "balance_loss_mlp": 1.05476475, "epoch": 0.044611453479633245, "flos": 28803661672320.0, "grad_norm": 1.7086571433899975, "language_loss": 0.88933527, "learning_rate": 3.997761273778037e-06, "loss": 0.91250861, "num_input_tokens_seen": 15934540, "step": 742, "time_per_iteration": 2.6647751331329346 }, { "auxiliary_loss_clip": 0.01237248, "auxiliary_loss_mlp": 0.0106534, "balance_loss_clip": 1.06481838, "balance_loss_mlp": 1.03367805, "epoch": 0.04467157673230122, "flos": 20010970304640.0, "grad_norm": 1.9055071619943689, "language_loss": 0.83840811, "learning_rate": 3.997742813608561e-06, "loss": 0.86143398, "num_input_tokens_seen": 15952560, "step": 743, "time_per_iteration": 2.697864055633545 }, { "auxiliary_loss_clip": 0.01239398, "auxiliary_loss_mlp": 0.01073846, "balance_loss_clip": 1.06395566, "balance_loss_mlp": 1.04373407, "epoch": 0.04473169998496919, "flos": 18004964480640.0, "grad_norm": 2.2041873634107696, "language_loss": 0.80026019, "learning_rate": 3.997724277684479e-06, "loss": 0.82339263, "num_input_tokens_seen": 15970620, "step": 744, "time_per_iteration": 2.6551101207733154 }, { "auxiliary_loss_clip": 0.01236158, "auxiliary_loss_mlp": 0.01076186, "balance_loss_clip": 1.06385589, "balance_loss_mlp": 1.04665816, "epoch": 0.044791823237637154, "flos": 20631722169600.0, "grad_norm": 2.139129927663487, "language_loss": 0.85502481, "learning_rate": 3.99770566600649e-06, "loss": 0.87814826, "num_input_tokens_seen": 15987325, "step": 745, "time_per_iteration": 2.6686010360717773 }, { "auxiliary_loss_clip": 0.01235001, "auxiliary_loss_mlp": 0.01066107, "balance_loss_clip": 1.06320596, "balance_loss_mlp": 1.03594685, "epoch": 0.04485194649030513, "flos": 31176171918720.0, "grad_norm": 1.8251828520192552, "language_loss": 0.69291008, "learning_rate": 3.997686978575302e-06, "loss": 0.71592116, "num_input_tokens_seen": 16008310, "step": 746, "time_per_iteration": 2.6782095432281494 }, { "auxiliary_loss_clip": 0.01244022, "auxiliary_loss_mlp": 0.01081644, "balance_loss_clip": 1.07012939, "balance_loss_mlp": 1.05000615, "epoch": 0.04491206974297309, "flos": 26143291831680.0, "grad_norm": 3.6053643469900982, "language_loss": 0.68531066, "learning_rate": 3.997668215391625e-06, "loss": 0.70856726, "num_input_tokens_seen": 16029620, "step": 747, "time_per_iteration": 2.6589114665985107 }, { "auxiliary_loss_clip": 0.0124018, "auxiliary_loss_mlp": 0.01083594, "balance_loss_clip": 1.0652504, "balance_loss_mlp": 1.05183625, "epoch": 0.044972192995641064, "flos": 20667668705280.0, "grad_norm": 1.8376208182131786, "language_loss": 0.66778374, "learning_rate": 3.997649376456168e-06, "loss": 0.69102144, "num_input_tokens_seen": 16049065, "step": 748, "time_per_iteration": 2.674691677093506 }, { "auxiliary_loss_clip": 0.01243343, "auxiliary_loss_mlp": 0.01085665, "balance_loss_clip": 1.07101417, "balance_loss_mlp": 1.05596995, "epoch": 0.045032316248309036, "flos": 16106834177280.0, "grad_norm": 2.4197486882062322, "language_loss": 0.76684916, "learning_rate": 3.997630461769647e-06, "loss": 0.7901392, "num_input_tokens_seen": 16066765, "step": 749, "time_per_iteration": 2.5940611362457275 }, { "auxiliary_loss_clip": 0.01243381, "auxiliary_loss_mlp": 0.01083303, "balance_loss_clip": 1.06892776, "balance_loss_mlp": 1.05338168, "epoch": 0.045092439500977, "flos": 17858843953920.0, "grad_norm": 1.926675828378473, "language_loss": 0.88739896, "learning_rate": 3.997611471332778e-06, "loss": 0.91066581, "num_input_tokens_seen": 16085980, "step": 750, "time_per_iteration": 2.551717758178711 }, { "auxiliary_loss_clip": 0.01238484, "auxiliary_loss_mlp": 0.01077419, "balance_loss_clip": 1.062783, "balance_loss_mlp": 1.04404092, "epoch": 0.04515256275364497, "flos": 24462815990400.0, "grad_norm": 3.4910287963746116, "language_loss": 0.74371743, "learning_rate": 3.9975924051462825e-06, "loss": 0.76687646, "num_input_tokens_seen": 16106260, "step": 751, "time_per_iteration": 2.6299028396606445 }, { "auxiliary_loss_clip": 0.0123577, "auxiliary_loss_mlp": 0.01078322, "balance_loss_clip": 1.06347609, "balance_loss_mlp": 1.04884171, "epoch": 0.04521268600631294, "flos": 20916385453440.0, "grad_norm": 3.3938056459605583, "language_loss": 0.69115144, "learning_rate": 3.997573263210883e-06, "loss": 0.71429229, "num_input_tokens_seen": 16123475, "step": 752, "time_per_iteration": 2.571223020553589 }, { "auxiliary_loss_clip": 0.01235899, "auxiliary_loss_mlp": 0.01060876, "balance_loss_clip": 1.0627141, "balance_loss_mlp": 1.03212225, "epoch": 0.04527280925898091, "flos": 13371374954880.0, "grad_norm": 2.69328062598792, "language_loss": 0.92126763, "learning_rate": 3.997554045527305e-06, "loss": 0.94423538, "num_input_tokens_seen": 16138335, "step": 753, "time_per_iteration": 2.6100237369537354 }, { "auxiliary_loss_clip": 0.01239023, "auxiliary_loss_mlp": 0.01080271, "balance_loss_clip": 1.06628633, "balance_loss_mlp": 1.05116034, "epoch": 0.04533293251164888, "flos": 23254565276160.0, "grad_norm": 4.138305317267875, "language_loss": 0.91373456, "learning_rate": 3.997534752096277e-06, "loss": 0.93692756, "num_input_tokens_seen": 16157110, "step": 754, "time_per_iteration": 2.642747402191162 }, { "auxiliary_loss_clip": 0.01229195, "auxiliary_loss_mlp": 0.01078016, "balance_loss_clip": 1.06402516, "balance_loss_mlp": 1.04725957, "epoch": 0.04539305576431685, "flos": 12422004537600.0, "grad_norm": 4.559941934311277, "language_loss": 0.78558046, "learning_rate": 3.997515382918531e-06, "loss": 0.80865264, "num_input_tokens_seen": 16174155, "step": 755, "time_per_iteration": 2.6316659450531006 }, { "auxiliary_loss_clip": 0.01240044, "auxiliary_loss_mlp": 0.01081048, "balance_loss_clip": 1.06624937, "balance_loss_mlp": 1.05099559, "epoch": 0.04545317901698482, "flos": 16070995382400.0, "grad_norm": 2.193539224658874, "language_loss": 0.78473848, "learning_rate": 3.9974959379948015e-06, "loss": 0.80794942, "num_input_tokens_seen": 16192240, "step": 756, "time_per_iteration": 2.6390748023986816 }, { "auxiliary_loss_clip": 0.01101224, "auxiliary_loss_mlp": 0.01013849, "balance_loss_clip": 1.02455997, "balance_loss_mlp": 1.0089612, "epoch": 0.045513302269652785, "flos": 66396139021440.0, "grad_norm": 0.8202876780471967, "language_loss": 0.62756521, "learning_rate": 3.997476417325827e-06, "loss": 0.64871597, "num_input_tokens_seen": 16255775, "step": 757, "time_per_iteration": 3.2393198013305664 }, { "auxiliary_loss_clip": 0.01235136, "auxiliary_loss_mlp": 0.01071767, "balance_loss_clip": 1.06455243, "balance_loss_mlp": 1.04346693, "epoch": 0.04557342552232076, "flos": 21471169991040.0, "grad_norm": 1.6528285304744148, "language_loss": 0.84211069, "learning_rate": 3.997456820912346e-06, "loss": 0.86517978, "num_input_tokens_seen": 16277015, "step": 758, "time_per_iteration": 2.6508655548095703 }, { "auxiliary_loss_clip": 0.01228461, "auxiliary_loss_mlp": 0.01067033, "balance_loss_clip": 1.05912399, "balance_loss_mlp": 1.0391618, "epoch": 0.04563354877498873, "flos": 23732680233600.0, "grad_norm": 2.695805662282291, "language_loss": 0.88150775, "learning_rate": 3.997437148755101e-06, "loss": 0.9044627, "num_input_tokens_seen": 16296005, "step": 759, "time_per_iteration": 2.7782890796661377 }, { "auxiliary_loss_clip": 0.01240589, "auxiliary_loss_mlp": 0.01078815, "balance_loss_clip": 1.06747675, "balance_loss_mlp": 1.04846466, "epoch": 0.045693672027656694, "flos": 25735741142400.0, "grad_norm": 2.392455009776849, "language_loss": 0.73440695, "learning_rate": 3.9974174008548405e-06, "loss": 0.75760102, "num_input_tokens_seen": 16315300, "step": 760, "time_per_iteration": 2.7138822078704834 }, { "auxiliary_loss_clip": 0.01240372, "auxiliary_loss_mlp": 0.01079791, "balance_loss_clip": 1.07095265, "balance_loss_mlp": 1.05162191, "epoch": 0.045753795280324666, "flos": 19719016560000.0, "grad_norm": 3.497321311688565, "language_loss": 0.81781888, "learning_rate": 3.9973975772123105e-06, "loss": 0.84102058, "num_input_tokens_seen": 16333820, "step": 761, "time_per_iteration": 2.631303310394287 }, { "auxiliary_loss_clip": 0.01231969, "auxiliary_loss_mlp": 0.01078623, "balance_loss_clip": 1.06324267, "balance_loss_mlp": 1.04922605, "epoch": 0.04581391853299264, "flos": 23255786338560.0, "grad_norm": 2.0632320043111965, "language_loss": 0.79811668, "learning_rate": 3.997377677828266e-06, "loss": 0.82122266, "num_input_tokens_seen": 16355290, "step": 762, "time_per_iteration": 2.646928071975708 }, { "auxiliary_loss_clip": 0.01093869, "auxiliary_loss_mlp": 0.01027943, "balance_loss_clip": 1.01857328, "balance_loss_mlp": 1.02288842, "epoch": 0.0458740417856606, "flos": 64231155601920.0, "grad_norm": 1.0128965743658471, "language_loss": 0.58723813, "learning_rate": 3.9973577027034585e-06, "loss": 0.60845619, "num_input_tokens_seen": 16415995, "step": 763, "time_per_iteration": 3.1712563037872314 }, { "auxiliary_loss_clip": 0.012343, "auxiliary_loss_mlp": 0.01082461, "balance_loss_clip": 1.06205368, "balance_loss_mlp": 1.0531354, "epoch": 0.045934165038328575, "flos": 20770121272320.0, "grad_norm": 4.978761831483118, "language_loss": 0.87544954, "learning_rate": 3.9973376518386475e-06, "loss": 0.89861715, "num_input_tokens_seen": 16433120, "step": 764, "time_per_iteration": 2.5985426902770996 }, { "auxiliary_loss_clip": 0.01236145, "auxiliary_loss_mlp": 0.01087868, "balance_loss_clip": 1.06553543, "balance_loss_mlp": 1.05854285, "epoch": 0.04599428829099654, "flos": 30262891691520.0, "grad_norm": 2.0894169515773067, "language_loss": 0.85966802, "learning_rate": 3.997317525234592e-06, "loss": 0.88290817, "num_input_tokens_seen": 16453360, "step": 765, "time_per_iteration": 2.6572606563568115 }, { "auxiliary_loss_clip": 0.01239644, "auxiliary_loss_mlp": 0.01077398, "balance_loss_clip": 1.06530261, "balance_loss_mlp": 1.04573584, "epoch": 0.04605441154366451, "flos": 23038921975680.0, "grad_norm": 2.628046285830335, "language_loss": 0.88265938, "learning_rate": 3.997297322892056e-06, "loss": 0.90582979, "num_input_tokens_seen": 16471160, "step": 766, "time_per_iteration": 2.673226833343506 }, { "auxiliary_loss_clip": 0.01235506, "auxiliary_loss_mlp": 0.0107998, "balance_loss_clip": 1.06371713, "balance_loss_mlp": 1.05115545, "epoch": 0.046114534796332485, "flos": 22017407091840.0, "grad_norm": 2.343908591401411, "language_loss": 0.84302223, "learning_rate": 3.997277044811806e-06, "loss": 0.86617708, "num_input_tokens_seen": 16488940, "step": 767, "time_per_iteration": 2.683429002761841 }, { "auxiliary_loss_clip": 0.01236229, "auxiliary_loss_mlp": 0.01067844, "balance_loss_clip": 1.06769753, "balance_loss_mlp": 1.03791094, "epoch": 0.04617465804900045, "flos": 29862380067840.0, "grad_norm": 1.9268984031305718, "language_loss": 0.8669976, "learning_rate": 3.99725669099461e-06, "loss": 0.89003831, "num_input_tokens_seen": 16509505, "step": 768, "time_per_iteration": 2.8125200271606445 }, { "auxiliary_loss_clip": 0.01234175, "auxiliary_loss_mlp": 0.01076069, "balance_loss_clip": 1.06150854, "balance_loss_mlp": 1.04738712, "epoch": 0.04623478130166842, "flos": 25630056351360.0, "grad_norm": 2.115272554881108, "language_loss": 0.75152099, "learning_rate": 3.9972362614412395e-06, "loss": 0.77462339, "num_input_tokens_seen": 16528840, "step": 769, "time_per_iteration": 2.7286128997802734 }, { "auxiliary_loss_clip": 0.01229956, "auxiliary_loss_mlp": 0.01072391, "balance_loss_clip": 1.06326365, "balance_loss_mlp": 1.04462695, "epoch": 0.04629490455433639, "flos": 20449080489600.0, "grad_norm": 1.8368669953292174, "language_loss": 0.86292851, "learning_rate": 3.997215756152471e-06, "loss": 0.885952, "num_input_tokens_seen": 16548335, "step": 770, "time_per_iteration": 2.68608021736145 }, { "auxiliary_loss_clip": 0.01239009, "auxiliary_loss_mlp": 0.01072125, "balance_loss_clip": 1.06274092, "balance_loss_mlp": 1.04284704, "epoch": 0.04635502780700436, "flos": 23148736830720.0, "grad_norm": 2.058802627607224, "language_loss": 0.86842889, "learning_rate": 3.99719517512908e-06, "loss": 0.89154023, "num_input_tokens_seen": 16567725, "step": 771, "time_per_iteration": 2.637509822845459 }, { "auxiliary_loss_clip": 0.01239449, "auxiliary_loss_mlp": 0.01079651, "balance_loss_clip": 1.06184912, "balance_loss_mlp": 1.04884768, "epoch": 0.04641515105967233, "flos": 23292020183040.0, "grad_norm": 1.87920888608735, "language_loss": 0.83691382, "learning_rate": 3.997174518371848e-06, "loss": 0.8601048, "num_input_tokens_seen": 16588175, "step": 772, "time_per_iteration": 2.745006561279297 }, { "auxiliary_loss_clip": 0.01236322, "auxiliary_loss_mlp": 0.0107061, "balance_loss_clip": 1.06672883, "balance_loss_mlp": 1.04220271, "epoch": 0.046475274312340296, "flos": 25115204759040.0, "grad_norm": 1.9655107083336736, "language_loss": 0.73639083, "learning_rate": 3.997153785881557e-06, "loss": 0.75946015, "num_input_tokens_seen": 16607735, "step": 773, "time_per_iteration": 2.869290828704834 }, { "auxiliary_loss_clip": 0.01231219, "auxiliary_loss_mlp": 0.01071681, "balance_loss_clip": 1.06529772, "balance_loss_mlp": 1.04054356, "epoch": 0.04653539756500827, "flos": 25264916645760.0, "grad_norm": 2.096431798380756, "language_loss": 0.78228974, "learning_rate": 3.997132977658996e-06, "loss": 0.80531871, "num_input_tokens_seen": 16627225, "step": 774, "time_per_iteration": 2.6967568397521973 }, { "auxiliary_loss_clip": 0.01230587, "auxiliary_loss_mlp": 0.01069519, "balance_loss_clip": 1.06347871, "balance_loss_mlp": 1.04131365, "epoch": 0.046595520817676234, "flos": 35404150089600.0, "grad_norm": 2.018140205527256, "language_loss": 0.73187691, "learning_rate": 3.997112093704952e-06, "loss": 0.75487792, "num_input_tokens_seen": 16647785, "step": 775, "time_per_iteration": 2.737140417098999 }, { "auxiliary_loss_clip": 0.01231996, "auxiliary_loss_mlp": 0.01066454, "balance_loss_clip": 1.06187618, "balance_loss_mlp": 1.03650832, "epoch": 0.046655644070344206, "flos": 18112516778880.0, "grad_norm": 1.668093168561758, "language_loss": 0.77180624, "learning_rate": 3.997091134020217e-06, "loss": 0.7947908, "num_input_tokens_seen": 16667555, "step": 776, "time_per_iteration": 4.154085159301758 }, { "auxiliary_loss_clip": 0.0122577, "auxiliary_loss_mlp": 0.01071334, "balance_loss_clip": 1.06031108, "balance_loss_mlp": 1.04352236, "epoch": 0.04671576732301218, "flos": 29205286617600.0, "grad_norm": 1.9054628166827923, "language_loss": 0.7087816, "learning_rate": 3.997070098605585e-06, "loss": 0.73175263, "num_input_tokens_seen": 16686875, "step": 777, "time_per_iteration": 4.176887512207031 }, { "auxiliary_loss_clip": 0.0122979, "auxiliary_loss_mlp": 0.01076806, "balance_loss_clip": 1.06275606, "balance_loss_mlp": 1.04705119, "epoch": 0.04677589057568014, "flos": 30478319510400.0, "grad_norm": 1.8083238359854679, "language_loss": 0.77069759, "learning_rate": 3.997048987461856e-06, "loss": 0.79376352, "num_input_tokens_seen": 16706420, "step": 778, "time_per_iteration": 5.943394422531128 }, { "auxiliary_loss_clip": 0.01227067, "auxiliary_loss_mlp": 0.01064982, "balance_loss_clip": 1.06043744, "balance_loss_mlp": 1.03563297, "epoch": 0.046836013828348115, "flos": 20557674282240.0, "grad_norm": 2.1737778598926463, "language_loss": 0.79181123, "learning_rate": 3.997027800589829e-06, "loss": 0.81473172, "num_input_tokens_seen": 16726390, "step": 779, "time_per_iteration": 2.611804485321045 }, { "auxiliary_loss_clip": 0.01219629, "auxiliary_loss_mlp": 0.01070238, "balance_loss_clip": 1.05842376, "balance_loss_mlp": 1.04271269, "epoch": 0.04689613708101608, "flos": 25447378757760.0, "grad_norm": 1.888854926622149, "language_loss": 0.77364886, "learning_rate": 3.997006537990308e-06, "loss": 0.79654753, "num_input_tokens_seen": 16748965, "step": 780, "time_per_iteration": 2.668239116668701 }, { "auxiliary_loss_clip": 0.012253, "auxiliary_loss_mlp": 0.01073321, "balance_loss_clip": 1.06098521, "balance_loss_mlp": 1.04605746, "epoch": 0.04695626033368405, "flos": 23001395241600.0, "grad_norm": 1.7616538282563206, "language_loss": 0.76700419, "learning_rate": 3.996985199664099e-06, "loss": 0.78999043, "num_input_tokens_seen": 16768620, "step": 781, "time_per_iteration": 2.5979926586151123 }, { "auxiliary_loss_clip": 0.01236637, "auxiliary_loss_mlp": 0.01077479, "balance_loss_clip": 1.0639379, "balance_loss_mlp": 1.04836786, "epoch": 0.047016383586352024, "flos": 29133357632640.0, "grad_norm": 3.0946494667490856, "language_loss": 0.73786414, "learning_rate": 3.99696378561201e-06, "loss": 0.76100528, "num_input_tokens_seen": 16789755, "step": 782, "time_per_iteration": 2.708855390548706 }, { "auxiliary_loss_clip": 0.0122968, "auxiliary_loss_mlp": 0.01069368, "balance_loss_clip": 1.06431556, "balance_loss_mlp": 1.04253423, "epoch": 0.04707650683901999, "flos": 14976330451200.0, "grad_norm": 2.1459158015790183, "language_loss": 0.80524659, "learning_rate": 3.996942295834855e-06, "loss": 0.82823706, "num_input_tokens_seen": 16807585, "step": 783, "time_per_iteration": 2.6355738639831543 }, { "auxiliary_loss_clip": 0.01222415, "auxiliary_loss_mlp": 0.01063155, "balance_loss_clip": 1.06221437, "balance_loss_mlp": 1.03663135, "epoch": 0.04713663009168796, "flos": 21651118151040.0, "grad_norm": 1.9084512066318515, "language_loss": 0.81687874, "learning_rate": 3.996920730333448e-06, "loss": 0.83973444, "num_input_tokens_seen": 16827220, "step": 784, "time_per_iteration": 2.64365291595459 }, { "auxiliary_loss_clip": 0.01226632, "auxiliary_loss_mlp": 0.01074549, "balance_loss_clip": 1.0582943, "balance_loss_mlp": 1.04719007, "epoch": 0.04719675334435593, "flos": 21325408600320.0, "grad_norm": 3.970707764370453, "language_loss": 0.80619848, "learning_rate": 3.996899089108607e-06, "loss": 0.82921028, "num_input_tokens_seen": 16846230, "step": 785, "time_per_iteration": 2.682971715927124 }, { "auxiliary_loss_clip": 0.01231621, "auxiliary_loss_mlp": 0.01063774, "balance_loss_clip": 1.06683421, "balance_loss_mlp": 1.03784585, "epoch": 0.0472568765970239, "flos": 17931383470080.0, "grad_norm": 2.074448818096939, "language_loss": 0.89784658, "learning_rate": 3.996877372161152e-06, "loss": 0.92080051, "num_input_tokens_seen": 16865325, "step": 786, "time_per_iteration": 2.6072235107421875 }, { "auxiliary_loss_clip": 0.01227201, "auxiliary_loss_mlp": 0.01069453, "balance_loss_clip": 1.05475712, "balance_loss_mlp": 1.03912568, "epoch": 0.04731699984969187, "flos": 18077324428800.0, "grad_norm": 6.783818284100465, "language_loss": 0.76794451, "learning_rate": 3.9968555794919065e-06, "loss": 0.79091108, "num_input_tokens_seen": 16882930, "step": 787, "time_per_iteration": 2.595069646835327 }, { "auxiliary_loss_clip": 0.01233526, "auxiliary_loss_mlp": 0.01070856, "balance_loss_clip": 1.06563127, "balance_loss_mlp": 1.04248405, "epoch": 0.047377123102359836, "flos": 23185078416000.0, "grad_norm": 2.309745026689568, "language_loss": 0.81301165, "learning_rate": 3.996833711101698e-06, "loss": 0.83605546, "num_input_tokens_seen": 16900710, "step": 788, "time_per_iteration": 2.633812427520752 }, { "auxiliary_loss_clip": 0.01225447, "auxiliary_loss_mlp": 0.01078934, "balance_loss_clip": 1.06370282, "balance_loss_mlp": 1.04934621, "epoch": 0.04743724635502781, "flos": 22747794243840.0, "grad_norm": 2.941245147417381, "language_loss": 0.84428835, "learning_rate": 3.996811766991355e-06, "loss": 0.86733222, "num_input_tokens_seen": 16919210, "step": 789, "time_per_iteration": 2.6711082458496094 }, { "auxiliary_loss_clip": 0.01230866, "auxiliary_loss_mlp": 0.01071483, "balance_loss_clip": 1.06367648, "balance_loss_mlp": 1.0441606, "epoch": 0.04749736960769577, "flos": 17238702620160.0, "grad_norm": 2.0289407228390615, "language_loss": 0.81787878, "learning_rate": 3.996789747161709e-06, "loss": 0.84090227, "num_input_tokens_seen": 16937125, "step": 790, "time_per_iteration": 2.6136717796325684 }, { "auxiliary_loss_clip": 0.01224033, "auxiliary_loss_mlp": 0.01064065, "balance_loss_clip": 1.05880189, "balance_loss_mlp": 1.03546715, "epoch": 0.047557492860363745, "flos": 40479261592320.0, "grad_norm": 2.9735437778568965, "language_loss": 0.88116109, "learning_rate": 3.996767651613597e-06, "loss": 0.90404207, "num_input_tokens_seen": 16958610, "step": 791, "time_per_iteration": 2.747586727142334 }, { "auxiliary_loss_clip": 0.01226267, "auxiliary_loss_mlp": 0.01066471, "balance_loss_clip": 1.06144643, "balance_loss_mlp": 1.03743124, "epoch": 0.04761761611303172, "flos": 18698004466560.0, "grad_norm": 2.1239226540804537, "language_loss": 0.90671498, "learning_rate": 3.996745480347854e-06, "loss": 0.92964232, "num_input_tokens_seen": 16977300, "step": 792, "time_per_iteration": 2.591477870941162 }, { "auxiliary_loss_clip": 0.01226882, "auxiliary_loss_mlp": 0.0107926, "balance_loss_clip": 1.05968022, "balance_loss_mlp": 1.05225897, "epoch": 0.04767773936569968, "flos": 20921987975040.0, "grad_norm": 1.9120988315570397, "language_loss": 0.73246223, "learning_rate": 3.996723233365324e-06, "loss": 0.75552362, "num_input_tokens_seen": 16994950, "step": 793, "time_per_iteration": 2.6319899559020996 }, { "auxiliary_loss_clip": 0.01231301, "auxiliary_loss_mlp": 0.01070716, "balance_loss_clip": 1.06213653, "balance_loss_mlp": 1.04146254, "epoch": 0.047737862618367655, "flos": 23732680233600.0, "grad_norm": 1.86347948201136, "language_loss": 0.86139679, "learning_rate": 3.996700910666847e-06, "loss": 0.88441694, "num_input_tokens_seen": 17014760, "step": 794, "time_per_iteration": 2.6835687160491943 }, { "auxiliary_loss_clip": 0.01228204, "auxiliary_loss_mlp": 0.01077895, "balance_loss_clip": 1.05969596, "balance_loss_mlp": 1.04935622, "epoch": 0.04779798587103562, "flos": 23695764030720.0, "grad_norm": 2.370166301863074, "language_loss": 0.69069195, "learning_rate": 3.996678512253272e-06, "loss": 0.71375293, "num_input_tokens_seen": 17032715, "step": 795, "time_per_iteration": 2.669261932373047 }, { "auxiliary_loss_clip": 0.01225748, "auxiliary_loss_mlp": 0.01076275, "balance_loss_clip": 1.06129098, "balance_loss_mlp": 1.04756904, "epoch": 0.04785810912370359, "flos": 23183641872000.0, "grad_norm": 1.744925212230271, "language_loss": 0.810256, "learning_rate": 3.996656038125449e-06, "loss": 0.83327615, "num_input_tokens_seen": 17052215, "step": 796, "time_per_iteration": 2.5800065994262695 }, { "auxiliary_loss_clip": 0.01228235, "auxiliary_loss_mlp": 0.01065433, "balance_loss_clip": 1.06224668, "balance_loss_mlp": 1.03638172, "epoch": 0.047918232376371564, "flos": 18040623707520.0, "grad_norm": 1.979164246440182, "language_loss": 0.8128069, "learning_rate": 3.996633488284228e-06, "loss": 0.83574355, "num_input_tokens_seen": 17069225, "step": 797, "time_per_iteration": 2.58878493309021 }, { "auxiliary_loss_clip": 0.01100259, "auxiliary_loss_mlp": 0.01007215, "balance_loss_clip": 1.02779806, "balance_loss_mlp": 1.00266171, "epoch": 0.04797835562903953, "flos": 62442588758400.0, "grad_norm": 0.912416075283383, "language_loss": 0.64532876, "learning_rate": 3.996610862730465e-06, "loss": 0.66640353, "num_input_tokens_seen": 17126680, "step": 798, "time_per_iteration": 3.0779380798339844 }, { "auxiliary_loss_clip": 0.01229665, "auxiliary_loss_mlp": 0.01068747, "balance_loss_clip": 1.05799031, "balance_loss_mlp": 1.04121017, "epoch": 0.0480384788817075, "flos": 21507296094720.0, "grad_norm": 2.0206600610723333, "language_loss": 0.91274291, "learning_rate": 3.996588161465018e-06, "loss": 0.935727, "num_input_tokens_seen": 17144835, "step": 799, "time_per_iteration": 2.660438299179077 }, { "auxiliary_loss_clip": 0.01230751, "auxiliary_loss_mlp": 0.010715, "balance_loss_clip": 1.06640434, "balance_loss_mlp": 1.04274678, "epoch": 0.048098602134375466, "flos": 21726710323200.0, "grad_norm": 2.0752654205923866, "language_loss": 0.86825287, "learning_rate": 3.996565384488748e-06, "loss": 0.89127541, "num_input_tokens_seen": 17165030, "step": 800, "time_per_iteration": 2.6700456142425537 }, { "auxiliary_loss_clip": 0.01229893, "auxiliary_loss_mlp": 0.01072058, "balance_loss_clip": 1.06186771, "balance_loss_mlp": 1.04618931, "epoch": 0.04815872538704344, "flos": 22931082368640.0, "grad_norm": 2.5310108886746976, "language_loss": 0.83949852, "learning_rate": 3.996542531802518e-06, "loss": 0.86251807, "num_input_tokens_seen": 17184895, "step": 801, "time_per_iteration": 2.7724695205688477 }, { "auxiliary_loss_clip": 0.01227846, "auxiliary_loss_mlp": 0.010756, "balance_loss_clip": 1.06226814, "balance_loss_mlp": 1.04847932, "epoch": 0.04821884863971141, "flos": 43174716042240.0, "grad_norm": 1.9607091513106172, "language_loss": 0.79818648, "learning_rate": 3.996519603407196e-06, "loss": 0.82122099, "num_input_tokens_seen": 17208225, "step": 802, "time_per_iteration": 2.861309766769409 }, { "auxiliary_loss_clip": 0.0122832, "auxiliary_loss_mlp": 0.01069086, "balance_loss_clip": 1.06392837, "balance_loss_mlp": 1.04278886, "epoch": 0.048278971892379376, "flos": 18620006083200.0, "grad_norm": 1.798745906633195, "language_loss": 0.86600745, "learning_rate": 3.996496599303649e-06, "loss": 0.88898146, "num_input_tokens_seen": 17226305, "step": 803, "time_per_iteration": 2.612684965133667 }, { "auxiliary_loss_clip": 0.01222438, "auxiliary_loss_mlp": 0.01063116, "balance_loss_clip": 1.06214345, "balance_loss_mlp": 1.03643703, "epoch": 0.04833909514504735, "flos": 20230061310720.0, "grad_norm": 5.958214069975319, "language_loss": 0.85139012, "learning_rate": 3.996473519492753e-06, "loss": 0.8742457, "num_input_tokens_seen": 17244545, "step": 804, "time_per_iteration": 2.596965789794922 }, { "auxiliary_loss_clip": 0.01225485, "auxiliary_loss_mlp": 0.0106948, "balance_loss_clip": 1.06206632, "balance_loss_mlp": 1.04222918, "epoch": 0.04839921839771532, "flos": 24645170361600.0, "grad_norm": 1.9492340448514227, "language_loss": 0.85939878, "learning_rate": 3.99645036397538e-06, "loss": 0.88234842, "num_input_tokens_seen": 17265730, "step": 805, "time_per_iteration": 2.6773781776428223 }, { "auxiliary_loss_clip": 0.01221339, "auxiliary_loss_mlp": 0.01071867, "balance_loss_clip": 1.05968738, "balance_loss_mlp": 1.04591477, "epoch": 0.048459341650383285, "flos": 24827452905600.0, "grad_norm": 1.8764849579047527, "language_loss": 0.68025368, "learning_rate": 3.9964271327524085e-06, "loss": 0.70318574, "num_input_tokens_seen": 17284820, "step": 806, "time_per_iteration": 2.6270596981048584 }, { "auxiliary_loss_clip": 0.01221043, "auxiliary_loss_mlp": 0.01060505, "balance_loss_clip": 1.06064904, "balance_loss_mlp": 1.03384972, "epoch": 0.04851946490305126, "flos": 22163204396160.0, "grad_norm": 8.586680684018, "language_loss": 0.76488906, "learning_rate": 3.9964038258247214e-06, "loss": 0.78770459, "num_input_tokens_seen": 17305085, "step": 807, "time_per_iteration": 2.6783089637756348 }, { "auxiliary_loss_clip": 0.01218859, "auxiliary_loss_mlp": 0.01068871, "balance_loss_clip": 1.05734789, "balance_loss_mlp": 1.04290676, "epoch": 0.04857958815571922, "flos": 19792022952960.0, "grad_norm": 2.4056749627509157, "language_loss": 0.86882269, "learning_rate": 3.9963804431932005e-06, "loss": 0.89170003, "num_input_tokens_seen": 17322715, "step": 808, "time_per_iteration": 2.6447641849517822 }, { "auxiliary_loss_clip": 0.01227529, "auxiliary_loss_mlp": 0.01069446, "balance_loss_clip": 1.06140316, "balance_loss_mlp": 1.0424329, "epoch": 0.048639711408387194, "flos": 18697968552960.0, "grad_norm": 2.6040733531164424, "language_loss": 0.89710444, "learning_rate": 3.996356984858732e-06, "loss": 0.92007422, "num_input_tokens_seen": 17341455, "step": 809, "time_per_iteration": 2.6679790019989014 }, { "auxiliary_loss_clip": 0.01226608, "auxiliary_loss_mlp": 0.01067211, "balance_loss_clip": 1.0643065, "balance_loss_mlp": 1.04060316, "epoch": 0.048699834661055166, "flos": 24863507182080.0, "grad_norm": 3.0721319202916324, "language_loss": 0.84918916, "learning_rate": 3.996333450822208e-06, "loss": 0.87212729, "num_input_tokens_seen": 17360765, "step": 810, "time_per_iteration": 2.696772575378418 }, { "auxiliary_loss_clip": 0.01227202, "auxiliary_loss_mlp": 0.01067343, "balance_loss_clip": 1.0622344, "balance_loss_mlp": 1.04049683, "epoch": 0.04875995791372313, "flos": 20704010290560.0, "grad_norm": 1.8136675943398954, "language_loss": 0.80799425, "learning_rate": 3.99630984108452e-06, "loss": 0.83093977, "num_input_tokens_seen": 17380625, "step": 811, "time_per_iteration": 2.653808355331421 }, { "auxiliary_loss_clip": 0.01217843, "auxiliary_loss_mlp": 0.01070621, "balance_loss_clip": 1.05928314, "balance_loss_mlp": 1.04466903, "epoch": 0.048820081166391104, "flos": 18588297352320.0, "grad_norm": 1.7193599003225197, "language_loss": 0.74634516, "learning_rate": 3.9962861556465615e-06, "loss": 0.76922977, "num_input_tokens_seen": 17399355, "step": 812, "time_per_iteration": 2.7274649143218994 }, { "auxiliary_loss_clip": 0.01222659, "auxiliary_loss_mlp": 0.01073562, "balance_loss_clip": 1.06445217, "balance_loss_mlp": 1.04862356, "epoch": 0.04888020441905907, "flos": 22707322594560.0, "grad_norm": 1.9311665765462733, "language_loss": 0.90124279, "learning_rate": 3.996262394509233e-06, "loss": 0.92420495, "num_input_tokens_seen": 17418240, "step": 813, "time_per_iteration": 2.654874801635742 }, { "auxiliary_loss_clip": 0.0122, "auxiliary_loss_mlp": 0.01057827, "balance_loss_clip": 1.06157589, "balance_loss_mlp": 1.03248262, "epoch": 0.04894032767172704, "flos": 22784351310720.0, "grad_norm": 1.9238840150723209, "language_loss": 0.74904704, "learning_rate": 3.9962385576734335e-06, "loss": 0.77182531, "num_input_tokens_seen": 17436250, "step": 814, "time_per_iteration": 2.7381603717803955 }, { "auxiliary_loss_clip": 0.01223782, "auxiliary_loss_mlp": 0.01069686, "balance_loss_clip": 1.06125045, "balance_loss_mlp": 1.04289961, "epoch": 0.04900045092439501, "flos": 25516147345920.0, "grad_norm": 2.1966001004582596, "language_loss": 0.83816808, "learning_rate": 3.9962146451400675e-06, "loss": 0.86110282, "num_input_tokens_seen": 17455750, "step": 815, "time_per_iteration": 2.7289621829986572 }, { "auxiliary_loss_clip": 0.01227011, "auxiliary_loss_mlp": 0.01060571, "balance_loss_clip": 1.06326818, "balance_loss_mlp": 1.0344646, "epoch": 0.04906057417706298, "flos": 25958136199680.0, "grad_norm": 2.3329994981275943, "language_loss": 0.90796101, "learning_rate": 3.996190656910043e-06, "loss": 0.93083686, "num_input_tokens_seen": 17474995, "step": 816, "time_per_iteration": 4.174290180206299 }, { "auxiliary_loss_clip": 0.01226278, "auxiliary_loss_mlp": 0.0105651, "balance_loss_clip": 1.06172895, "balance_loss_mlp": 1.03054583, "epoch": 0.04912069742973095, "flos": 18624638937600.0, "grad_norm": 2.2253098946667853, "language_loss": 0.79834002, "learning_rate": 3.996166592984268e-06, "loss": 0.82116789, "num_input_tokens_seen": 17493395, "step": 817, "time_per_iteration": 4.2819907665252686 }, { "auxiliary_loss_clip": 0.01222491, "auxiliary_loss_mlp": 0.01072358, "balance_loss_clip": 1.06228495, "balance_loss_mlp": 1.04563141, "epoch": 0.049180820682398915, "flos": 23699786353920.0, "grad_norm": 1.9292138186207266, "language_loss": 0.8532303, "learning_rate": 3.996142453363656e-06, "loss": 0.8761788, "num_input_tokens_seen": 17514565, "step": 818, "time_per_iteration": 7.687308073043823 }, { "auxiliary_loss_clip": 0.01228571, "auxiliary_loss_mlp": 0.01064433, "balance_loss_clip": 1.06170368, "balance_loss_mlp": 1.0369786, "epoch": 0.04924094393506689, "flos": 22420396753920.0, "grad_norm": 2.1064810754058407, "language_loss": 0.75623614, "learning_rate": 3.996118238049124e-06, "loss": 0.77916616, "num_input_tokens_seen": 17534590, "step": 819, "time_per_iteration": 2.5708072185516357 }, { "auxiliary_loss_clip": 0.01227988, "auxiliary_loss_mlp": 0.010616, "balance_loss_clip": 1.06580663, "balance_loss_mlp": 1.03785336, "epoch": 0.04930106718773486, "flos": 15738246766080.0, "grad_norm": 2.8685299631500487, "language_loss": 0.85082126, "learning_rate": 3.996093947041586e-06, "loss": 0.87371719, "num_input_tokens_seen": 17551900, "step": 820, "time_per_iteration": 2.695204973220825 }, { "auxiliary_loss_clip": 0.01224953, "auxiliary_loss_mlp": 0.01065985, "balance_loss_clip": 1.06082845, "balance_loss_mlp": 1.04037917, "epoch": 0.049361190440402825, "flos": 26250628648320.0, "grad_norm": 1.734636988660555, "language_loss": 0.90459162, "learning_rate": 3.996069580341966e-06, "loss": 0.92750102, "num_input_tokens_seen": 17571485, "step": 821, "time_per_iteration": 2.6284992694854736 }, { "auxiliary_loss_clip": 0.01222526, "auxiliary_loss_mlp": 0.01080357, "balance_loss_clip": 1.06015635, "balance_loss_mlp": 1.05485809, "epoch": 0.0494213136930708, "flos": 21252366293760.0, "grad_norm": 1.7915267676548876, "language_loss": 0.89795959, "learning_rate": 3.996045137951188e-06, "loss": 0.92098844, "num_input_tokens_seen": 17591410, "step": 822, "time_per_iteration": 2.6085855960845947 }, { "auxiliary_loss_clip": 0.0122571, "auxiliary_loss_mlp": 0.01062887, "balance_loss_clip": 1.0639379, "balance_loss_mlp": 1.03472972, "epoch": 0.04948143694573876, "flos": 27965506740480.0, "grad_norm": 2.28747155105076, "language_loss": 0.67558801, "learning_rate": 3.996020619870178e-06, "loss": 0.69847399, "num_input_tokens_seen": 17612010, "step": 823, "time_per_iteration": 2.644277572631836 }, { "auxiliary_loss_clip": 0.01099376, "auxiliary_loss_mlp": 0.0100741, "balance_loss_clip": 1.0267303, "balance_loss_mlp": 1.00266516, "epoch": 0.049541560198406734, "flos": 66180995533440.0, "grad_norm": 1.3456360586087317, "language_loss": 0.62254131, "learning_rate": 3.995996026099866e-06, "loss": 0.64360917, "num_input_tokens_seen": 17673430, "step": 824, "time_per_iteration": 3.230381488800049 }, { "auxiliary_loss_clip": 0.01228758, "auxiliary_loss_mlp": 0.01066541, "balance_loss_clip": 1.06346989, "balance_loss_mlp": 1.03909945, "epoch": 0.049601683451074706, "flos": 22892693708160.0, "grad_norm": 1.8854339538524305, "language_loss": 0.90479428, "learning_rate": 3.995971356641185e-06, "loss": 0.92774737, "num_input_tokens_seen": 17689545, "step": 825, "time_per_iteration": 2.58868670463562 }, { "auxiliary_loss_clip": 0.01227734, "auxiliary_loss_mlp": 0.01066527, "balance_loss_clip": 1.06315517, "balance_loss_mlp": 1.03844118, "epoch": 0.04966180670374267, "flos": 21433643256960.0, "grad_norm": 2.307419213246734, "language_loss": 0.66851091, "learning_rate": 3.9959466114950695e-06, "loss": 0.69145352, "num_input_tokens_seen": 17705965, "step": 826, "time_per_iteration": 2.59468412399292 }, { "auxiliary_loss_clip": 0.01230149, "auxiliary_loss_mlp": 0.01069061, "balance_loss_clip": 1.06421614, "balance_loss_mlp": 1.04216766, "epoch": 0.04972192995641064, "flos": 23107367341440.0, "grad_norm": 1.8316571551414482, "language_loss": 0.78298402, "learning_rate": 3.995921790662459e-06, "loss": 0.80597603, "num_input_tokens_seen": 17724580, "step": 827, "time_per_iteration": 2.7148005962371826 }, { "auxiliary_loss_clip": 0.01230507, "auxiliary_loss_mlp": 0.01079145, "balance_loss_clip": 1.06385946, "balance_loss_mlp": 1.05119085, "epoch": 0.04978205320907861, "flos": 40406147458560.0, "grad_norm": 1.6017511297862308, "language_loss": 0.78696525, "learning_rate": 3.995896894144294e-06, "loss": 0.81006181, "num_input_tokens_seen": 17747755, "step": 828, "time_per_iteration": 2.86991548538208 }, { "auxiliary_loss_clip": 0.0121958, "auxiliary_loss_mlp": 0.01059689, "balance_loss_clip": 1.05939984, "balance_loss_mlp": 1.03390431, "epoch": 0.04984217646174658, "flos": 25228539146880.0, "grad_norm": 2.48577103336206, "language_loss": 0.83530867, "learning_rate": 3.995871921941519e-06, "loss": 0.85810131, "num_input_tokens_seen": 17768550, "step": 829, "time_per_iteration": 2.655895948410034 }, { "auxiliary_loss_clip": 0.01226863, "auxiliary_loss_mlp": 0.01080723, "balance_loss_clip": 1.06109536, "balance_loss_mlp": 1.05068195, "epoch": 0.04990229971441455, "flos": 15959636242560.0, "grad_norm": 2.078538436430036, "language_loss": 0.74857247, "learning_rate": 3.99584687405508e-06, "loss": 0.77164829, "num_input_tokens_seen": 17786080, "step": 830, "time_per_iteration": 2.5820400714874268 }, { "auxiliary_loss_clip": 0.0122584, "auxiliary_loss_mlp": 0.01074077, "balance_loss_clip": 1.06154907, "balance_loss_mlp": 1.04667115, "epoch": 0.04996242296708252, "flos": 18405116968320.0, "grad_norm": 1.8327841960194244, "language_loss": 0.79279459, "learning_rate": 3.995821750485929e-06, "loss": 0.81579381, "num_input_tokens_seen": 17803635, "step": 831, "time_per_iteration": 2.5980231761932373 }, { "auxiliary_loss_clip": 0.01173206, "auxiliary_loss_mlp": 0.01072743, "balance_loss_clip": 1.0542444, "balance_loss_mlp": 1.04725623, "epoch": 0.05002254621975049, "flos": 17858053854720.0, "grad_norm": 3.034319898285603, "language_loss": 0.91497368, "learning_rate": 3.995796551235016e-06, "loss": 0.93743312, "num_input_tokens_seen": 17822190, "step": 832, "time_per_iteration": 2.7498815059661865 }, { "auxiliary_loss_clip": 0.01194428, "auxiliary_loss_mlp": 0.01081719, "balance_loss_clip": 1.05826366, "balance_loss_mlp": 1.05667353, "epoch": 0.050082669472418455, "flos": 45660273367680.0, "grad_norm": 1.887029338258115, "language_loss": 0.83167893, "learning_rate": 3.9957712763032974e-06, "loss": 0.85444039, "num_input_tokens_seen": 17846915, "step": 833, "time_per_iteration": 2.863208770751953 }, { "auxiliary_loss_clip": 0.01199525, "auxiliary_loss_mlp": 0.01061962, "balance_loss_clip": 1.05888343, "balance_loss_mlp": 1.03468657, "epoch": 0.05014279272508643, "flos": 37962067363200.0, "grad_norm": 2.8753922020214033, "language_loss": 0.82409853, "learning_rate": 3.995745925691733e-06, "loss": 0.84671336, "num_input_tokens_seen": 17867270, "step": 834, "time_per_iteration": 2.7868030071258545 }, { "auxiliary_loss_clip": 0.01216246, "auxiliary_loss_mlp": 0.01064427, "balance_loss_clip": 1.06272483, "balance_loss_mlp": 1.03672278, "epoch": 0.0502029159777544, "flos": 20996179516800.0, "grad_norm": 2.2306487397141646, "language_loss": 0.92186153, "learning_rate": 3.995720499401282e-06, "loss": 0.94466823, "num_input_tokens_seen": 17884880, "step": 835, "time_per_iteration": 2.6224496364593506 }, { "auxiliary_loss_clip": 0.01229494, "auxiliary_loss_mlp": 0.01074922, "balance_loss_clip": 1.06143415, "balance_loss_mlp": 1.0464313, "epoch": 0.050263039230422364, "flos": 15888066393600.0, "grad_norm": 2.196832783808158, "language_loss": 0.76143622, "learning_rate": 3.995694997432911e-06, "loss": 0.78448039, "num_input_tokens_seen": 17903695, "step": 836, "time_per_iteration": 2.5648462772369385 }, { "auxiliary_loss_clip": 0.01211162, "auxiliary_loss_mlp": 0.01075977, "balance_loss_clip": 1.06259084, "balance_loss_mlp": 1.04992962, "epoch": 0.050323162483090336, "flos": 23732752060800.0, "grad_norm": 2.100773352560791, "language_loss": 0.83627856, "learning_rate": 3.9956694197875855e-06, "loss": 0.85914999, "num_input_tokens_seen": 17920745, "step": 837, "time_per_iteration": 2.7420156002044678 }, { "auxiliary_loss_clip": 0.01198815, "auxiliary_loss_mlp": 0.0078439, "balance_loss_clip": 1.06345344, "balance_loss_mlp": 1.00053763, "epoch": 0.0503832857357583, "flos": 20266223328000.0, "grad_norm": 2.1353335821274477, "language_loss": 0.72857559, "learning_rate": 3.995643766466275e-06, "loss": 0.7484076, "num_input_tokens_seen": 17938220, "step": 838, "time_per_iteration": 2.679177761077881 }, { "auxiliary_loss_clip": 0.01189223, "auxiliary_loss_mlp": 0.01071526, "balance_loss_clip": 1.05415273, "balance_loss_mlp": 1.04510927, "epoch": 0.05044340898842627, "flos": 17785011548160.0, "grad_norm": 1.8138261016039334, "language_loss": 0.83462799, "learning_rate": 3.995618037469953e-06, "loss": 0.85723549, "num_input_tokens_seen": 17957325, "step": 839, "time_per_iteration": 2.69063663482666 }, { "auxiliary_loss_clip": 0.01220356, "auxiliary_loss_mlp": 0.01069331, "balance_loss_clip": 1.05991399, "balance_loss_mlp": 1.04411805, "epoch": 0.050503532241094246, "flos": 22966526113920.0, "grad_norm": 1.7513762525269907, "language_loss": 0.85775483, "learning_rate": 3.995592232799595e-06, "loss": 0.88065171, "num_input_tokens_seen": 17975875, "step": 840, "time_per_iteration": 2.6477303504943848 }, { "auxiliary_loss_clip": 0.01192112, "auxiliary_loss_mlp": 0.01064377, "balance_loss_clip": 1.05451894, "balance_loss_mlp": 1.036291, "epoch": 0.05056365549376221, "flos": 22776989022720.0, "grad_norm": 1.7956760046069329, "language_loss": 0.9457823, "learning_rate": 3.99556635245618e-06, "loss": 0.96834719, "num_input_tokens_seen": 17994340, "step": 841, "time_per_iteration": 2.8354220390319824 }, { "auxiliary_loss_clip": 0.0122473, "auxiliary_loss_mlp": 0.01070125, "balance_loss_clip": 1.06219172, "balance_loss_mlp": 1.04329097, "epoch": 0.05062377874643018, "flos": 30916968399360.0, "grad_norm": 2.3106044659054104, "language_loss": 0.77566791, "learning_rate": 3.995540396440688e-06, "loss": 0.79861641, "num_input_tokens_seen": 18015260, "step": 842, "time_per_iteration": 2.6909749507904053 }, { "auxiliary_loss_clip": 0.01214637, "auxiliary_loss_mlp": 0.01071033, "balance_loss_clip": 1.06270838, "balance_loss_mlp": 1.04391265, "epoch": 0.05068390199909815, "flos": 19647159402240.0, "grad_norm": 2.8849837971101864, "language_loss": 0.78126526, "learning_rate": 3.995514364754105e-06, "loss": 0.80412203, "num_input_tokens_seen": 18033960, "step": 843, "time_per_iteration": 2.6534156799316406 }, { "auxiliary_loss_clip": 0.01212948, "auxiliary_loss_mlp": 0.01063612, "balance_loss_clip": 1.06317043, "balance_loss_mlp": 1.03894806, "epoch": 0.05074402525176612, "flos": 37962103276800.0, "grad_norm": 1.9320015451631862, "language_loss": 0.83256191, "learning_rate": 3.995488257397417e-06, "loss": 0.85532749, "num_input_tokens_seen": 18056700, "step": 844, "time_per_iteration": 2.7682149410247803 }, { "auxiliary_loss_clip": 0.01216308, "auxiliary_loss_mlp": 0.01067162, "balance_loss_clip": 1.06307864, "balance_loss_mlp": 1.04138875, "epoch": 0.05080414850443409, "flos": 22054610603520.0, "grad_norm": 2.113957107027846, "language_loss": 0.77108061, "learning_rate": 3.995462074371614e-06, "loss": 0.79391527, "num_input_tokens_seen": 18075815, "step": 845, "time_per_iteration": 2.6720399856567383 }, { "auxiliary_loss_clip": 0.01206643, "auxiliary_loss_mlp": 0.01065522, "balance_loss_clip": 1.05881417, "balance_loss_mlp": 1.03885484, "epoch": 0.05086427175710206, "flos": 20225787592320.0, "grad_norm": 1.8497392628450484, "language_loss": 0.87773871, "learning_rate": 3.99543581567769e-06, "loss": 0.90046036, "num_input_tokens_seen": 18095095, "step": 846, "time_per_iteration": 2.696049690246582 }, { "auxiliary_loss_clip": 0.01206291, "auxiliary_loss_mlp": 0.01069231, "balance_loss_clip": 1.06204462, "balance_loss_mlp": 1.04330277, "epoch": 0.05092439500977003, "flos": 15159223526400.0, "grad_norm": 1.695550491545423, "language_loss": 0.87364423, "learning_rate": 3.9954094813166394e-06, "loss": 0.89639944, "num_input_tokens_seen": 18112675, "step": 847, "time_per_iteration": 2.666907548904419 }, { "auxiliary_loss_clip": 0.01175052, "auxiliary_loss_mlp": 0.01071976, "balance_loss_clip": 1.06267309, "balance_loss_mlp": 1.0447005, "epoch": 0.050984518262437994, "flos": 22055149307520.0, "grad_norm": 2.5687168450386637, "language_loss": 0.81878662, "learning_rate": 3.995383071289462e-06, "loss": 0.84125686, "num_input_tokens_seen": 18130745, "step": 848, "time_per_iteration": 2.782135486602783 }, { "auxiliary_loss_clip": 0.0122638, "auxiliary_loss_mlp": 0.01071388, "balance_loss_clip": 1.06619906, "balance_loss_mlp": 1.04544854, "epoch": 0.05104464151510597, "flos": 30225329043840.0, "grad_norm": 1.678404869397893, "language_loss": 0.87187904, "learning_rate": 3.995356585597158e-06, "loss": 0.89485669, "num_input_tokens_seen": 18152410, "step": 849, "time_per_iteration": 2.787992000579834 }, { "auxiliary_loss_clip": 0.01220251, "auxiliary_loss_mlp": 0.0106131, "balance_loss_clip": 1.06049275, "balance_loss_mlp": 1.03545308, "epoch": 0.05110476476777394, "flos": 18332900674560.0, "grad_norm": 2.125711462362114, "language_loss": 0.8315587, "learning_rate": 3.995330024240732e-06, "loss": 0.85437429, "num_input_tokens_seen": 18170870, "step": 850, "time_per_iteration": 2.6548752784729004 }, { "auxiliary_loss_clip": 0.01210598, "auxiliary_loss_mlp": 0.01063491, "balance_loss_clip": 1.06061506, "balance_loss_mlp": 1.0379566, "epoch": 0.051164888020441904, "flos": 37998732170880.0, "grad_norm": 2.2115645013354253, "language_loss": 0.65423882, "learning_rate": 3.995303387221192e-06, "loss": 0.67697972, "num_input_tokens_seen": 18191555, "step": 851, "time_per_iteration": 2.817197322845459 }, { "auxiliary_loss_clip": 0.0120566, "auxiliary_loss_mlp": 0.01075745, "balance_loss_clip": 1.05822444, "balance_loss_mlp": 1.04761147, "epoch": 0.051225011273109876, "flos": 23038634666880.0, "grad_norm": 2.3720786299251073, "language_loss": 0.83587611, "learning_rate": 3.995276674539547e-06, "loss": 0.8586902, "num_input_tokens_seen": 18208620, "step": 852, "time_per_iteration": 2.685727119445801 }, { "auxiliary_loss_clip": 0.01193575, "auxiliary_loss_mlp": 0.01074152, "balance_loss_clip": 1.05924761, "balance_loss_mlp": 1.04737723, "epoch": 0.05128513452577785, "flos": 18259822454400.0, "grad_norm": 2.1832763559951234, "language_loss": 0.80761266, "learning_rate": 3.995249886196811e-06, "loss": 0.8302899, "num_input_tokens_seen": 18226370, "step": 853, "time_per_iteration": 2.6078240871429443 }, { "auxiliary_loss_clip": 0.01222394, "auxiliary_loss_mlp": 0.01065268, "balance_loss_clip": 1.06223083, "balance_loss_mlp": 1.03780222, "epoch": 0.05134525777844581, "flos": 27198957571200.0, "grad_norm": 1.8511550328562763, "language_loss": 0.75617325, "learning_rate": 3.995223022193999e-06, "loss": 0.77904987, "num_input_tokens_seen": 18247075, "step": 854, "time_per_iteration": 2.633543014526367 }, { "auxiliary_loss_clip": 0.01202415, "auxiliary_loss_mlp": 0.01065973, "balance_loss_clip": 1.06141627, "balance_loss_mlp": 1.03828049, "epoch": 0.051405381031113785, "flos": 28362247436160.0, "grad_norm": 2.04057054323539, "language_loss": 0.81722355, "learning_rate": 3.99519608253213e-06, "loss": 0.83990741, "num_input_tokens_seen": 18265680, "step": 855, "time_per_iteration": 2.760880708694458 }, { "auxiliary_loss_clip": 0.01076712, "auxiliary_loss_mlp": 0.00762392, "balance_loss_clip": 1.0358243, "balance_loss_mlp": 1.00074518, "epoch": 0.05146550428378175, "flos": 65618169327360.0, "grad_norm": 0.9894594919315515, "language_loss": 0.65634769, "learning_rate": 3.995169067212227e-06, "loss": 0.67473871, "num_input_tokens_seen": 18327015, "step": 856, "time_per_iteration": 6.271182298660278 }, { "auxiliary_loss_clip": 0.01194232, "auxiliary_loss_mlp": 0.01056626, "balance_loss_clip": 1.05972147, "balance_loss_mlp": 1.02994716, "epoch": 0.05152562753644972, "flos": 22054861998720.0, "grad_norm": 1.8001295724347575, "language_loss": 0.77139348, "learning_rate": 3.9951419762353116e-06, "loss": 0.79390204, "num_input_tokens_seen": 18345235, "step": 857, "time_per_iteration": 4.905239582061768 }, { "auxiliary_loss_clip": 0.01183581, "auxiliary_loss_mlp": 0.01059685, "balance_loss_clip": 1.05640614, "balance_loss_mlp": 1.03291047, "epoch": 0.051585750789117694, "flos": 18509544783360.0, "grad_norm": 2.111656321737554, "language_loss": 0.89194518, "learning_rate": 3.995114809602412e-06, "loss": 0.91437781, "num_input_tokens_seen": 18362350, "step": 858, "time_per_iteration": 2.7349045276641846 }, { "auxiliary_loss_clip": 0.01196113, "auxiliary_loss_mlp": 0.01060739, "balance_loss_clip": 1.06114125, "balance_loss_mlp": 1.03398848, "epoch": 0.05164587404178566, "flos": 23730238108800.0, "grad_norm": 2.030377637624243, "language_loss": 0.75684321, "learning_rate": 3.9950875673145605e-06, "loss": 0.77941179, "num_input_tokens_seen": 18383390, "step": 859, "time_per_iteration": 2.7611751556396484 }, { "auxiliary_loss_clip": 0.01186313, "auxiliary_loss_mlp": 0.0107269, "balance_loss_clip": 1.05708003, "balance_loss_mlp": 1.04354358, "epoch": 0.05170599729445363, "flos": 16252882876800.0, "grad_norm": 2.134655488493178, "language_loss": 0.91122925, "learning_rate": 3.995060249372788e-06, "loss": 0.93381929, "num_input_tokens_seen": 18399220, "step": 860, "time_per_iteration": 2.666740894317627 }, { "auxiliary_loss_clip": 0.0122488, "auxiliary_loss_mlp": 0.01060586, "balance_loss_clip": 1.06531346, "balance_loss_mlp": 1.03536153, "epoch": 0.0517661205471216, "flos": 23985922095360.0, "grad_norm": 1.7954568874114027, "language_loss": 0.82378531, "learning_rate": 3.99503285577813e-06, "loss": 0.84663993, "num_input_tokens_seen": 18419005, "step": 861, "time_per_iteration": 2.6337814331054688 }, { "auxiliary_loss_clip": 0.01198486, "auxiliary_loss_mlp": 0.01060236, "balance_loss_clip": 1.06147969, "balance_loss_mlp": 1.03437924, "epoch": 0.05182624379978957, "flos": 29277718392960.0, "grad_norm": 2.5785699637959776, "language_loss": 0.78664875, "learning_rate": 3.995005386531627e-06, "loss": 0.80923599, "num_input_tokens_seen": 18440550, "step": 862, "time_per_iteration": 2.7570109367370605 }, { "auxiliary_loss_clip": 0.01189664, "auxiliary_loss_mlp": 0.01070327, "balance_loss_clip": 1.058797, "balance_loss_mlp": 1.04547238, "epoch": 0.05188636705245754, "flos": 24170826332160.0, "grad_norm": 1.7880881456146414, "language_loss": 0.89090264, "learning_rate": 3.9949778416343195e-06, "loss": 0.91350257, "num_input_tokens_seen": 18461950, "step": 863, "time_per_iteration": 2.7118866443634033 }, { "auxiliary_loss_clip": 0.01201772, "auxiliary_loss_mlp": 0.01064316, "balance_loss_clip": 1.06488204, "balance_loss_mlp": 1.0369451, "epoch": 0.051946490305125506, "flos": 26760703731840.0, "grad_norm": 2.081656150811602, "language_loss": 0.76119763, "learning_rate": 3.9949502210872525e-06, "loss": 0.78385854, "num_input_tokens_seen": 18480555, "step": 864, "time_per_iteration": 2.6946637630462646 }, { "auxiliary_loss_clip": 0.01186585, "auxiliary_loss_mlp": 0.01067959, "balance_loss_clip": 1.05559874, "balance_loss_mlp": 1.04046965, "epoch": 0.05200661355779348, "flos": 21502519585920.0, "grad_norm": 1.9374308734697678, "language_loss": 0.7908361, "learning_rate": 3.994922524891474e-06, "loss": 0.81338149, "num_input_tokens_seen": 18499645, "step": 865, "time_per_iteration": 2.7700579166412354 }, { "auxiliary_loss_clip": 0.01210067, "auxiliary_loss_mlp": 0.01067568, "balance_loss_clip": 1.06164694, "balance_loss_mlp": 1.04152083, "epoch": 0.05206673681046144, "flos": 18114492026880.0, "grad_norm": 2.269489500676155, "language_loss": 0.85860598, "learning_rate": 3.994894753048032e-06, "loss": 0.88138229, "num_input_tokens_seen": 18516810, "step": 866, "time_per_iteration": 2.659614086151123 }, { "auxiliary_loss_clip": 0.01186536, "auxiliary_loss_mlp": 0.01070465, "balance_loss_clip": 1.06327558, "balance_loss_mlp": 1.04371393, "epoch": 0.052126860063129415, "flos": 17524191916800.0, "grad_norm": 2.1733876112564565, "language_loss": 0.87495244, "learning_rate": 3.9948669055579815e-06, "loss": 0.89752245, "num_input_tokens_seen": 18532510, "step": 867, "time_per_iteration": 2.740238904953003 }, { "auxiliary_loss_clip": 0.01167585, "auxiliary_loss_mlp": 0.01078445, "balance_loss_clip": 1.05696058, "balance_loss_mlp": 1.05437636, "epoch": 0.05218698331579739, "flos": 32598054771840.0, "grad_norm": 1.8498678854952728, "language_loss": 0.63917863, "learning_rate": 3.9948389824223785e-06, "loss": 0.66163892, "num_input_tokens_seen": 18557380, "step": 868, "time_per_iteration": 2.9310383796691895 }, { "auxiliary_loss_clip": 0.01225135, "auxiliary_loss_mlp": 0.01069894, "balance_loss_clip": 1.06287289, "balance_loss_mlp": 1.04173636, "epoch": 0.05224710656846535, "flos": 22127293774080.0, "grad_norm": 2.742912036955754, "language_loss": 0.83379138, "learning_rate": 3.994810983642281e-06, "loss": 0.85674161, "num_input_tokens_seen": 18575720, "step": 869, "time_per_iteration": 2.6453137397766113 }, { "auxiliary_loss_clip": 0.01216406, "auxiliary_loss_mlp": 0.01056401, "balance_loss_clip": 1.0645746, "balance_loss_mlp": 1.03053236, "epoch": 0.052307229821133325, "flos": 11145092976000.0, "grad_norm": 2.188953802542244, "language_loss": 0.87822217, "learning_rate": 3.994782909218751e-06, "loss": 0.90095031, "num_input_tokens_seen": 18592185, "step": 870, "time_per_iteration": 2.7044875621795654 }, { "auxiliary_loss_clip": 0.01226316, "auxiliary_loss_mlp": 0.01064746, "balance_loss_clip": 1.06603277, "balance_loss_mlp": 1.03965199, "epoch": 0.05236735307380129, "flos": 19128070005120.0, "grad_norm": 1.975067156516721, "language_loss": 0.80651748, "learning_rate": 3.994754759152854e-06, "loss": 0.82942802, "num_input_tokens_seen": 18609560, "step": 871, "time_per_iteration": 2.6892175674438477 }, { "auxiliary_loss_clip": 0.0119502, "auxiliary_loss_mlp": 0.01064309, "balance_loss_clip": 1.0650804, "balance_loss_mlp": 1.0396452, "epoch": 0.05242747632646926, "flos": 20960663944320.0, "grad_norm": 1.7402390708810018, "language_loss": 0.81330585, "learning_rate": 3.994726533445656e-06, "loss": 0.83589917, "num_input_tokens_seen": 18629405, "step": 872, "time_per_iteration": 2.8044185638427734 }, { "auxiliary_loss_clip": 0.0107835, "auxiliary_loss_mlp": 0.01020667, "balance_loss_clip": 1.03168392, "balance_loss_mlp": 1.01515913, "epoch": 0.052487599579137234, "flos": 65020542842880.0, "grad_norm": 0.883483589670371, "language_loss": 0.61589074, "learning_rate": 3.9946982320982274e-06, "loss": 0.63688087, "num_input_tokens_seen": 18681480, "step": 873, "time_per_iteration": 3.1711297035217285 }, { "auxiliary_loss_clip": 0.01197438, "auxiliary_loss_mlp": 0.01056818, "balance_loss_clip": 1.06202292, "balance_loss_mlp": 1.03120041, "epoch": 0.0525477228318052, "flos": 23288859786240.0, "grad_norm": 2.1995328011281488, "language_loss": 0.88965189, "learning_rate": 3.994669855111643e-06, "loss": 0.91219449, "num_input_tokens_seen": 18700390, "step": 874, "time_per_iteration": 2.8240153789520264 }, { "auxiliary_loss_clip": 0.01197247, "auxiliary_loss_mlp": 0.01063458, "balance_loss_clip": 1.0614326, "balance_loss_mlp": 1.03682709, "epoch": 0.05260784608447317, "flos": 32230221546240.0, "grad_norm": 1.858649685360537, "language_loss": 0.74537963, "learning_rate": 3.994641402486977e-06, "loss": 0.76798666, "num_input_tokens_seen": 18721280, "step": 875, "time_per_iteration": 2.9111931324005127 }, { "auxiliary_loss_clip": 0.01206205, "auxiliary_loss_mlp": 0.01058912, "balance_loss_clip": 1.06306934, "balance_loss_mlp": 1.03210175, "epoch": 0.052667969337141136, "flos": 24463211040000.0, "grad_norm": 1.7697857141051123, "language_loss": 0.92843151, "learning_rate": 3.99461287422531e-06, "loss": 0.95108265, "num_input_tokens_seen": 18741545, "step": 876, "time_per_iteration": 2.800252676010132 }, { "auxiliary_loss_clip": 0.01100151, "auxiliary_loss_mlp": 0.01006341, "balance_loss_clip": 1.02669787, "balance_loss_mlp": 1.0020256, "epoch": 0.05272809258980911, "flos": 57784329567360.0, "grad_norm": 0.8383495859932864, "language_loss": 0.62929404, "learning_rate": 3.994584270327722e-06, "loss": 0.65035897, "num_input_tokens_seen": 18801400, "step": 877, "time_per_iteration": 3.2090368270874023 }, { "auxiliary_loss_clip": 0.01200578, "auxiliary_loss_mlp": 0.0106702, "balance_loss_clip": 1.06150424, "balance_loss_mlp": 1.03931606, "epoch": 0.05278821584247708, "flos": 17420805596160.0, "grad_norm": 2.042786693643985, "language_loss": 0.85383844, "learning_rate": 3.994555590795299e-06, "loss": 0.87651443, "num_input_tokens_seen": 18819670, "step": 878, "time_per_iteration": 2.823835849761963 }, { "auxiliary_loss_clip": 0.0122514, "auxiliary_loss_mlp": 0.01061117, "balance_loss_clip": 1.0635035, "balance_loss_mlp": 1.03551078, "epoch": 0.052848339095145046, "flos": 26137258346880.0, "grad_norm": 1.7462717669338121, "language_loss": 0.83076209, "learning_rate": 3.9945268356291275e-06, "loss": 0.8536247, "num_input_tokens_seen": 18840580, "step": 879, "time_per_iteration": 2.743673086166382 }, { "auxiliary_loss_clip": 0.0119139, "auxiliary_loss_mlp": 0.01066471, "balance_loss_clip": 1.06152987, "balance_loss_mlp": 1.04013824, "epoch": 0.05290846234781302, "flos": 16472081623680.0, "grad_norm": 1.9601789563010765, "language_loss": 0.84284604, "learning_rate": 3.9944980048302985e-06, "loss": 0.86542469, "num_input_tokens_seen": 18859295, "step": 880, "time_per_iteration": 2.7560529708862305 }, { "auxiliary_loss_clip": 0.01184956, "auxiliary_loss_mlp": 0.01065063, "balance_loss_clip": 1.05969453, "balance_loss_mlp": 1.03887296, "epoch": 0.05296858560048098, "flos": 19865173000320.0, "grad_norm": 2.4477328752698564, "language_loss": 0.86870736, "learning_rate": 3.994469098399906e-06, "loss": 0.89120758, "num_input_tokens_seen": 18877485, "step": 881, "time_per_iteration": 2.855395555496216 }, { "auxiliary_loss_clip": 0.01207858, "auxiliary_loss_mlp": 0.01070235, "balance_loss_clip": 1.05984437, "balance_loss_mlp": 1.04238808, "epoch": 0.053028708853148955, "flos": 24388588535040.0, "grad_norm": 1.7611192020675561, "language_loss": 0.87967896, "learning_rate": 3.994440116339046e-06, "loss": 0.90245986, "num_input_tokens_seen": 18898275, "step": 882, "time_per_iteration": 2.8480119705200195 }, { "auxiliary_loss_clip": 0.01224906, "auxiliary_loss_mlp": 0.01057944, "balance_loss_clip": 1.06268644, "balance_loss_mlp": 1.03059733, "epoch": 0.05308883210581693, "flos": 36393166143360.0, "grad_norm": 2.3555018967788635, "language_loss": 0.69469339, "learning_rate": 3.994411058648816e-06, "loss": 0.71752191, "num_input_tokens_seen": 18920665, "step": 883, "time_per_iteration": 2.8808236122131348 }, { "auxiliary_loss_clip": 0.01166777, "auxiliary_loss_mlp": 0.01063991, "balance_loss_clip": 1.05333591, "balance_loss_mlp": 1.03855157, "epoch": 0.05314895535848489, "flos": 22855095146880.0, "grad_norm": 2.039016812023355, "language_loss": 0.76100993, "learning_rate": 3.994381925330319e-06, "loss": 0.78331757, "num_input_tokens_seen": 18939835, "step": 884, "time_per_iteration": 2.8462212085723877 }, { "auxiliary_loss_clip": 0.01172569, "auxiliary_loss_mlp": 0.01066856, "balance_loss_clip": 1.06269383, "balance_loss_mlp": 1.04147613, "epoch": 0.053209078611152864, "flos": 12860330204160.0, "grad_norm": 1.9865896222141148, "language_loss": 0.86195529, "learning_rate": 3.994352716384659e-06, "loss": 0.88434947, "num_input_tokens_seen": 18958405, "step": 885, "time_per_iteration": 2.7825753688812256 }, { "auxiliary_loss_clip": 0.0118405, "auxiliary_loss_mlp": 0.01068976, "balance_loss_clip": 1.05229151, "balance_loss_mlp": 1.04203486, "epoch": 0.05326920186382083, "flos": 12164596698240.0, "grad_norm": 2.608647457747672, "language_loss": 0.85971159, "learning_rate": 3.994323431812945e-06, "loss": 0.88224185, "num_input_tokens_seen": 18975445, "step": 886, "time_per_iteration": 2.7393639087677 }, { "auxiliary_loss_clip": 0.0117343, "auxiliary_loss_mlp": 0.01065966, "balance_loss_clip": 1.05620933, "balance_loss_mlp": 1.03879774, "epoch": 0.0533293251164888, "flos": 22704485420160.0, "grad_norm": 2.040002880698432, "language_loss": 0.8961553, "learning_rate": 3.994294071616286e-06, "loss": 0.91854936, "num_input_tokens_seen": 18991930, "step": 887, "time_per_iteration": 2.8606581687927246 }, { "auxiliary_loss_clip": 0.01144444, "auxiliary_loss_mlp": 0.01072438, "balance_loss_clip": 1.04453194, "balance_loss_mlp": 1.04411352, "epoch": 0.053389448369156774, "flos": 26940939200640.0, "grad_norm": 2.062562868466936, "language_loss": 0.74852538, "learning_rate": 3.994264635795796e-06, "loss": 0.77069414, "num_input_tokens_seen": 19009790, "step": 888, "time_per_iteration": 2.8675312995910645 }, { "auxiliary_loss_clip": 0.01164085, "auxiliary_loss_mlp": 0.01072324, "balance_loss_clip": 1.05659473, "balance_loss_mlp": 1.04525173, "epoch": 0.05344957162182474, "flos": 25556331686400.0, "grad_norm": 1.7884280759117637, "language_loss": 0.88440782, "learning_rate": 3.994235124352592e-06, "loss": 0.9067719, "num_input_tokens_seen": 19030170, "step": 889, "time_per_iteration": 2.9419636726379395 }, { "auxiliary_loss_clip": 0.0121577, "auxiliary_loss_mlp": 0.0105125, "balance_loss_clip": 1.06085157, "balance_loss_mlp": 1.02607334, "epoch": 0.05350969487449271, "flos": 19719591177600.0, "grad_norm": 1.9333059575084248, "language_loss": 0.88386381, "learning_rate": 3.994205537287791e-06, "loss": 0.90653402, "num_input_tokens_seen": 19048075, "step": 890, "time_per_iteration": 2.7030327320098877 }, { "auxiliary_loss_clip": 0.01195034, "auxiliary_loss_mlp": 0.01069003, "balance_loss_clip": 1.05835462, "balance_loss_mlp": 1.04450595, "epoch": 0.053569818127160676, "flos": 27016351804800.0, "grad_norm": 2.435204176890571, "language_loss": 0.93450797, "learning_rate": 3.994175874602517e-06, "loss": 0.95714831, "num_input_tokens_seen": 19067465, "step": 891, "time_per_iteration": 2.81527042388916 }, { "auxiliary_loss_clip": 0.01190797, "auxiliary_loss_mlp": 0.01066955, "balance_loss_clip": 1.05605483, "balance_loss_mlp": 1.03909576, "epoch": 0.05362994137982865, "flos": 13188338225280.0, "grad_norm": 2.3400199158693087, "language_loss": 0.71625131, "learning_rate": 3.994146136297893e-06, "loss": 0.73882878, "num_input_tokens_seen": 19085505, "step": 892, "time_per_iteration": 2.825984239578247 }, { "auxiliary_loss_clip": 0.01191313, "auxiliary_loss_mlp": 0.0078394, "balance_loss_clip": 1.05727172, "balance_loss_mlp": 1.00024366, "epoch": 0.05369006463249662, "flos": 28658008022400.0, "grad_norm": 1.6058100223173828, "language_loss": 0.82331586, "learning_rate": 3.994116322375049e-06, "loss": 0.84306836, "num_input_tokens_seen": 19104360, "step": 893, "time_per_iteration": 2.8618266582489014 }, { "auxiliary_loss_clip": 0.01192677, "auxiliary_loss_mlp": 0.01063531, "balance_loss_clip": 1.0572021, "balance_loss_mlp": 1.03850877, "epoch": 0.053750187885164585, "flos": 28913153304960.0, "grad_norm": 2.0228714136718122, "language_loss": 0.82052565, "learning_rate": 3.994086432835114e-06, "loss": 0.84308773, "num_input_tokens_seen": 19124680, "step": 894, "time_per_iteration": 2.8347885608673096 }, { "auxiliary_loss_clip": 0.0120111, "auxiliary_loss_mlp": 0.01065233, "balance_loss_clip": 1.0570271, "balance_loss_mlp": 1.03997254, "epoch": 0.05381031113783256, "flos": 15158828476800.0, "grad_norm": 2.260594705980758, "language_loss": 0.76133072, "learning_rate": 3.994056467679221e-06, "loss": 0.78399414, "num_input_tokens_seen": 19142895, "step": 895, "time_per_iteration": 2.7288858890533447 }, { "auxiliary_loss_clip": 0.01200143, "auxiliary_loss_mlp": 0.01060588, "balance_loss_clip": 1.06422663, "balance_loss_mlp": 1.03547084, "epoch": 0.05387043439050053, "flos": 21835232288640.0, "grad_norm": 2.0450623179174974, "language_loss": 0.86767507, "learning_rate": 3.9940264269085065e-06, "loss": 0.89028239, "num_input_tokens_seen": 19163125, "step": 896, "time_per_iteration": 4.404265642166138 }, { "auxiliary_loss_clip": 0.0122203, "auxiliary_loss_mlp": 0.00782931, "balance_loss_clip": 1.06062579, "balance_loss_mlp": 1.0002867, "epoch": 0.053930557643168495, "flos": 17310308382720.0, "grad_norm": 3.0866230440609805, "language_loss": 0.8797363, "learning_rate": 3.9939963105241115e-06, "loss": 0.89978594, "num_input_tokens_seen": 19179385, "step": 897, "time_per_iteration": 4.843130588531494 }, { "auxiliary_loss_clip": 0.01201639, "auxiliary_loss_mlp": 0.01063724, "balance_loss_clip": 1.05896854, "balance_loss_mlp": 1.03658032, "epoch": 0.05399068089583647, "flos": 17348481561600.0, "grad_norm": 1.8270040910241792, "language_loss": 0.90170419, "learning_rate": 3.993966118527175e-06, "loss": 0.92435783, "num_input_tokens_seen": 19198725, "step": 898, "time_per_iteration": 2.695235252380371 }, { "auxiliary_loss_clip": 0.01200189, "auxiliary_loss_mlp": 0.01076438, "balance_loss_clip": 1.05787873, "balance_loss_mlp": 1.05105805, "epoch": 0.05405080414850443, "flos": 17486952491520.0, "grad_norm": 2.793625116693953, "language_loss": 0.91544139, "learning_rate": 3.993935850918845e-06, "loss": 0.93820769, "num_input_tokens_seen": 19212380, "step": 899, "time_per_iteration": 2.7509548664093018 }, { "auxiliary_loss_clip": 0.01186479, "auxiliary_loss_mlp": 0.01068594, "balance_loss_clip": 1.05614042, "balance_loss_mlp": 1.04154527, "epoch": 0.054110927401172404, "flos": 24496787278080.0, "grad_norm": 1.983572968760697, "language_loss": 0.75742769, "learning_rate": 3.9939055077002665e-06, "loss": 0.77997845, "num_input_tokens_seen": 19232235, "step": 900, "time_per_iteration": 2.771371364593506 }, { "auxiliary_loss_clip": 0.01211506, "auxiliary_loss_mlp": 0.01058176, "balance_loss_clip": 1.05839145, "balance_loss_mlp": 1.03401244, "epoch": 0.054171050653840376, "flos": 22930040874240.0, "grad_norm": 2.192527627735503, "language_loss": 0.74331856, "learning_rate": 3.993875088872592e-06, "loss": 0.76601535, "num_input_tokens_seen": 19251460, "step": 901, "time_per_iteration": 2.859912157058716 }, { "auxiliary_loss_clip": 0.01177502, "auxiliary_loss_mlp": 0.01065445, "balance_loss_clip": 1.0569309, "balance_loss_mlp": 1.04166329, "epoch": 0.05423117390650834, "flos": 12933192942720.0, "grad_norm": 2.352700712836257, "language_loss": 0.85287452, "learning_rate": 3.9938445944369745e-06, "loss": 0.87530404, "num_input_tokens_seen": 19269060, "step": 902, "time_per_iteration": 2.7940642833709717 }, { "auxiliary_loss_clip": 0.01161069, "auxiliary_loss_mlp": 0.01066664, "balance_loss_clip": 1.04903233, "balance_loss_mlp": 1.04112983, "epoch": 0.05429129715917631, "flos": 19901335017600.0, "grad_norm": 1.9620711230312637, "language_loss": 0.86385572, "learning_rate": 3.993814024394569e-06, "loss": 0.88613302, "num_input_tokens_seen": 19288620, "step": 903, "time_per_iteration": 2.9258980751037598 }, { "auxiliary_loss_clip": 0.0121005, "auxiliary_loss_mlp": 0.01059616, "balance_loss_clip": 1.06094384, "balance_loss_mlp": 1.03534508, "epoch": 0.05435142041184428, "flos": 16908611610240.0, "grad_norm": 2.175127974944855, "language_loss": 0.74927866, "learning_rate": 3.993783378746537e-06, "loss": 0.7719754, "num_input_tokens_seen": 19306615, "step": 904, "time_per_iteration": 2.7239954471588135 }, { "auxiliary_loss_clip": 0.01208402, "auxiliary_loss_mlp": 0.01067543, "balance_loss_clip": 1.06052148, "balance_loss_mlp": 1.04325962, "epoch": 0.05441154366451225, "flos": 23948323534080.0, "grad_norm": 2.5191963984804535, "language_loss": 0.85946918, "learning_rate": 3.993752657494039e-06, "loss": 0.88222867, "num_input_tokens_seen": 19321680, "step": 905, "time_per_iteration": 2.693896532058716 }, { "auxiliary_loss_clip": 0.01198232, "auxiliary_loss_mlp": 0.01078072, "balance_loss_clip": 1.06483209, "balance_loss_mlp": 1.05400348, "epoch": 0.05447166691718022, "flos": 19975382904960.0, "grad_norm": 1.7753581401878566, "language_loss": 0.74413162, "learning_rate": 3.993721860638241e-06, "loss": 0.7668947, "num_input_tokens_seen": 19339760, "step": 906, "time_per_iteration": 2.6679019927978516 }, { "auxiliary_loss_clip": 0.01192373, "auxiliary_loss_mlp": 0.01064381, "balance_loss_clip": 1.05954027, "balance_loss_mlp": 1.0397284, "epoch": 0.05453179016984819, "flos": 24936513575040.0, "grad_norm": 2.3037248114268896, "language_loss": 0.87340188, "learning_rate": 3.993690988180309e-06, "loss": 0.89596951, "num_input_tokens_seen": 19359585, "step": 907, "time_per_iteration": 2.7363240718841553 }, { "auxiliary_loss_clip": 0.01205519, "auxiliary_loss_mlp": 0.01068463, "balance_loss_clip": 1.0616293, "balance_loss_mlp": 1.04332149, "epoch": 0.05459191342251616, "flos": 18115102558080.0, "grad_norm": 1.6666873589767146, "language_loss": 0.86928803, "learning_rate": 3.9936600401214165e-06, "loss": 0.89202785, "num_input_tokens_seen": 19378590, "step": 908, "time_per_iteration": 2.6266026496887207 }, { "auxiliary_loss_clip": 0.01198848, "auxiliary_loss_mlp": 0.01067336, "balance_loss_clip": 1.05974221, "balance_loss_mlp": 1.04107404, "epoch": 0.054652036675184125, "flos": 19208295031680.0, "grad_norm": 2.1282794409977215, "language_loss": 0.89792144, "learning_rate": 3.9936290164627345e-06, "loss": 0.92058325, "num_input_tokens_seen": 19397910, "step": 909, "time_per_iteration": 2.7163166999816895 }, { "auxiliary_loss_clip": 0.01200393, "auxiliary_loss_mlp": 0.01073374, "balance_loss_clip": 1.06157839, "balance_loss_mlp": 1.04742169, "epoch": 0.0547121599278521, "flos": 16325745615360.0, "grad_norm": 2.095924869989121, "language_loss": 0.70949811, "learning_rate": 3.99359791720544e-06, "loss": 0.73223579, "num_input_tokens_seen": 19415950, "step": 910, "time_per_iteration": 2.6697354316711426 }, { "auxiliary_loss_clip": 0.01187784, "auxiliary_loss_mlp": 0.01054671, "balance_loss_clip": 1.05651259, "balance_loss_mlp": 1.02975583, "epoch": 0.05477228318052007, "flos": 20339014239360.0, "grad_norm": 1.6633724338567386, "language_loss": 0.83651805, "learning_rate": 3.993566742350714e-06, "loss": 0.85894263, "num_input_tokens_seen": 19435275, "step": 911, "time_per_iteration": 2.692798137664795 }, { "auxiliary_loss_clip": 0.01187113, "auxiliary_loss_mlp": 0.01073028, "balance_loss_clip": 1.05334687, "balance_loss_mlp": 1.04719508, "epoch": 0.054832406433188034, "flos": 21973092687360.0, "grad_norm": 2.283907419545301, "language_loss": 0.76320881, "learning_rate": 3.993535491899736e-06, "loss": 0.78581023, "num_input_tokens_seen": 19452090, "step": 912, "time_per_iteration": 2.6653189659118652 }, { "auxiliary_loss_clip": 0.01186313, "auxiliary_loss_mlp": 0.01051652, "balance_loss_clip": 1.05707574, "balance_loss_mlp": 1.0271548, "epoch": 0.054892529685856006, "flos": 16398931576320.0, "grad_norm": 2.366460016615147, "language_loss": 0.82826668, "learning_rate": 3.993504165853694e-06, "loss": 0.85064626, "num_input_tokens_seen": 19470865, "step": 913, "time_per_iteration": 2.6826348304748535 }, { "auxiliary_loss_clip": 0.01194515, "auxiliary_loss_mlp": 0.01060483, "balance_loss_clip": 1.0581125, "balance_loss_mlp": 1.03651023, "epoch": 0.05495265293852397, "flos": 23912341084800.0, "grad_norm": 3.3338391252510586, "language_loss": 0.8373239, "learning_rate": 3.993472764213772e-06, "loss": 0.85987389, "num_input_tokens_seen": 19492145, "step": 914, "time_per_iteration": 2.7358829975128174 }, { "auxiliary_loss_clip": 0.0120705, "auxiliary_loss_mlp": 0.0078227, "balance_loss_clip": 1.06039774, "balance_loss_mlp": 1.00027478, "epoch": 0.055012776191191944, "flos": 23586954756480.0, "grad_norm": 2.520244909384168, "language_loss": 0.90146536, "learning_rate": 3.9934412869811655e-06, "loss": 0.92135859, "num_input_tokens_seen": 19511015, "step": 915, "time_per_iteration": 2.9398341178894043 }, { "auxiliary_loss_clip": 0.01201461, "auxiliary_loss_mlp": 0.01059252, "balance_loss_clip": 1.06274199, "balance_loss_mlp": 1.03558862, "epoch": 0.055072899443859916, "flos": 17528501548800.0, "grad_norm": 2.182721785653499, "language_loss": 0.89710975, "learning_rate": 3.993409734157064e-06, "loss": 0.91971689, "num_input_tokens_seen": 19529040, "step": 916, "time_per_iteration": 2.7210159301757812 }, { "auxiliary_loss_clip": 0.01175226, "auxiliary_loss_mlp": 0.01066073, "balance_loss_clip": 1.05741024, "balance_loss_mlp": 1.04103947, "epoch": 0.05513302269652788, "flos": 21687172427520.0, "grad_norm": 1.7899379897310368, "language_loss": 0.8016991, "learning_rate": 3.993378105742666e-06, "loss": 0.82411212, "num_input_tokens_seen": 19549540, "step": 917, "time_per_iteration": 2.7923104763031006 }, { "auxiliary_loss_clip": 0.01139072, "auxiliary_loss_mlp": 0.0105947, "balance_loss_clip": 1.05135942, "balance_loss_mlp": 1.03414989, "epoch": 0.05519314594919585, "flos": 21613340021760.0, "grad_norm": 2.106744179667805, "language_loss": 0.79437333, "learning_rate": 3.9933464017391705e-06, "loss": 0.81635869, "num_input_tokens_seen": 19567570, "step": 918, "time_per_iteration": 2.8051092624664307 }, { "auxiliary_loss_clip": 0.01196947, "auxiliary_loss_mlp": 0.01055679, "balance_loss_clip": 1.05616307, "balance_loss_mlp": 1.03166997, "epoch": 0.05525326920186382, "flos": 21798567480960.0, "grad_norm": 2.454030193031321, "language_loss": 0.89019686, "learning_rate": 3.99331462214778e-06, "loss": 0.91272312, "num_input_tokens_seen": 19585330, "step": 919, "time_per_iteration": 2.6846773624420166 }, { "auxiliary_loss_clip": 0.01213326, "auxiliary_loss_mlp": 0.01069349, "balance_loss_clip": 1.05950904, "balance_loss_mlp": 1.04417229, "epoch": 0.05531339245453179, "flos": 28439635288320.0, "grad_norm": 2.246354931091656, "language_loss": 0.8746047, "learning_rate": 3.993282766969699e-06, "loss": 0.89743137, "num_input_tokens_seen": 19604970, "step": 920, "time_per_iteration": 2.6699845790863037 }, { "auxiliary_loss_clip": 0.01190424, "auxiliary_loss_mlp": 0.0106036, "balance_loss_clip": 1.06023288, "balance_loss_mlp": 1.03657782, "epoch": 0.05537351570719976, "flos": 37375143131520.0, "grad_norm": 1.975714125194334, "language_loss": 0.6568011, "learning_rate": 3.993250836206136e-06, "loss": 0.67930895, "num_input_tokens_seen": 19626235, "step": 921, "time_per_iteration": 2.833644390106201 }, { "auxiliary_loss_clip": 0.01209678, "auxiliary_loss_mlp": 0.01065483, "balance_loss_clip": 1.06060767, "balance_loss_mlp": 1.03874445, "epoch": 0.05543363895986773, "flos": 20084479488000.0, "grad_norm": 1.7242493696651606, "language_loss": 0.71861136, "learning_rate": 3.993218829858301e-06, "loss": 0.74136293, "num_input_tokens_seen": 19644305, "step": 922, "time_per_iteration": 2.6168808937072754 }, { "auxiliary_loss_clip": 0.01187138, "auxiliary_loss_mlp": 0.01067213, "balance_loss_clip": 1.05423355, "balance_loss_mlp": 1.04223895, "epoch": 0.0554937622125357, "flos": 24533200690560.0, "grad_norm": 2.6848185900705412, "language_loss": 0.82304025, "learning_rate": 3.993186747927408e-06, "loss": 0.8455838, "num_input_tokens_seen": 19662130, "step": 923, "time_per_iteration": 2.7298316955566406 }, { "auxiliary_loss_clip": 0.01202941, "auxiliary_loss_mlp": 0.01064106, "balance_loss_clip": 1.05725455, "balance_loss_mlp": 1.03933442, "epoch": 0.055553885465203665, "flos": 14320063013760.0, "grad_norm": 1.9334372940525173, "language_loss": 0.78759122, "learning_rate": 3.993154590414675e-06, "loss": 0.81026167, "num_input_tokens_seen": 19680715, "step": 924, "time_per_iteration": 2.6869630813598633 }, { "auxiliary_loss_clip": 0.0116422, "auxiliary_loss_mlp": 0.01053758, "balance_loss_clip": 1.05395627, "balance_loss_mlp": 1.02844954, "epoch": 0.05561400871787164, "flos": 27381132374400.0, "grad_norm": 2.005203138116014, "language_loss": 1.02005315, "learning_rate": 3.993122357321319e-06, "loss": 1.04223299, "num_input_tokens_seen": 19700535, "step": 925, "time_per_iteration": 2.716089963912964 }, { "auxiliary_loss_clip": 0.01163201, "auxiliary_loss_mlp": 0.01052104, "balance_loss_clip": 1.05070591, "balance_loss_mlp": 1.02739179, "epoch": 0.05567413197053961, "flos": 23221096778880.0, "grad_norm": 2.0106641835017482, "language_loss": 0.80939209, "learning_rate": 3.993090048648564e-06, "loss": 0.83154511, "num_input_tokens_seen": 19718825, "step": 926, "time_per_iteration": 2.895803451538086 }, { "auxiliary_loss_clip": 0.01207515, "auxiliary_loss_mlp": 0.01068168, "balance_loss_clip": 1.05892682, "balance_loss_mlp": 1.0419066, "epoch": 0.055734255223207574, "flos": 25264952559360.0, "grad_norm": 2.9732625845644045, "language_loss": 0.73220479, "learning_rate": 3.993057664397634e-06, "loss": 0.75496161, "num_input_tokens_seen": 19739080, "step": 927, "time_per_iteration": 2.677725076675415 }, { "auxiliary_loss_clip": 0.01101002, "auxiliary_loss_mlp": 0.01015011, "balance_loss_clip": 1.02922702, "balance_loss_mlp": 1.01014709, "epoch": 0.055794378475875546, "flos": 66503116702080.0, "grad_norm": 0.8406874373244947, "language_loss": 0.59841412, "learning_rate": 3.9930252045697585e-06, "loss": 0.61957431, "num_input_tokens_seen": 19802960, "step": 928, "time_per_iteration": 3.187382221221924 }, { "auxiliary_loss_clip": 0.01202438, "auxiliary_loss_mlp": 0.01065066, "balance_loss_clip": 1.05921853, "balance_loss_mlp": 1.04070008, "epoch": 0.05585450172854351, "flos": 25337635729920.0, "grad_norm": 2.0668361967965994, "language_loss": 0.95411372, "learning_rate": 3.992992669166168e-06, "loss": 0.97678876, "num_input_tokens_seen": 19822765, "step": 929, "time_per_iteration": 2.6930506229400635 }, { "auxiliary_loss_clip": 0.01171806, "auxiliary_loss_mlp": 0.01068051, "balance_loss_clip": 1.05343258, "balance_loss_mlp": 1.04101443, "epoch": 0.05591462498121148, "flos": 33911738881920.0, "grad_norm": 2.1442452677256627, "language_loss": 0.71756601, "learning_rate": 3.992960058188094e-06, "loss": 0.7399646, "num_input_tokens_seen": 19843590, "step": 930, "time_per_iteration": 2.803219795227051 }, { "auxiliary_loss_clip": 0.01188277, "auxiliary_loss_mlp": 0.01058888, "balance_loss_clip": 1.05783677, "balance_loss_mlp": 1.03377056, "epoch": 0.055974748233879455, "flos": 17930880679680.0, "grad_norm": 2.381261552273062, "language_loss": 0.85279298, "learning_rate": 3.992927371636776e-06, "loss": 0.87526459, "num_input_tokens_seen": 19860230, "step": 931, "time_per_iteration": 2.6215872764587402 }, { "auxiliary_loss_clip": 0.01203533, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.05677414, "balance_loss_mlp": 1.00025761, "epoch": 0.05603487148654742, "flos": 24021976371840.0, "grad_norm": 2.2861197477099973, "language_loss": 0.83645165, "learning_rate": 3.9928946095134525e-06, "loss": 0.85631776, "num_input_tokens_seen": 19880795, "step": 932, "time_per_iteration": 2.664062261581421 }, { "auxiliary_loss_clip": 0.01200637, "auxiliary_loss_mlp": 0.0107041, "balance_loss_clip": 1.05897784, "balance_loss_mlp": 1.04407716, "epoch": 0.05609499473921539, "flos": 17307758517120.0, "grad_norm": 1.8036739452122519, "language_loss": 0.73694205, "learning_rate": 3.992861771819365e-06, "loss": 0.7596525, "num_input_tokens_seen": 19897960, "step": 933, "time_per_iteration": 2.631620168685913 }, { "auxiliary_loss_clip": 0.01153445, "auxiliary_loss_mlp": 0.01076903, "balance_loss_clip": 1.04885209, "balance_loss_mlp": 1.05060577, "epoch": 0.05615511799188336, "flos": 20994742972800.0, "grad_norm": 2.385249039382274, "language_loss": 0.86660421, "learning_rate": 3.99282885855576e-06, "loss": 0.88890779, "num_input_tokens_seen": 19913315, "step": 934, "time_per_iteration": 2.7739439010620117 }, { "auxiliary_loss_clip": 0.01164295, "auxiliary_loss_mlp": 0.0108083, "balance_loss_clip": 1.05509257, "balance_loss_mlp": 1.0557723, "epoch": 0.05621524124455133, "flos": 17273535834240.0, "grad_norm": 2.2740258482680433, "language_loss": 0.80388415, "learning_rate": 3.992795869723885e-06, "loss": 0.82633543, "num_input_tokens_seen": 19928790, "step": 935, "time_per_iteration": 5.93512487411499 }, { "auxiliary_loss_clip": 0.01093927, "auxiliary_loss_mlp": 0.01019701, "balance_loss_clip": 1.02288604, "balance_loss_mlp": 1.01540911, "epoch": 0.0562753644972193, "flos": 58719370458240.0, "grad_norm": 0.820561718243334, "language_loss": 0.69191676, "learning_rate": 3.99276280532499e-06, "loss": 0.71305299, "num_input_tokens_seen": 19988785, "step": 936, "time_per_iteration": 4.862478733062744 }, { "auxiliary_loss_clip": 0.01213648, "auxiliary_loss_mlp": 0.01068507, "balance_loss_clip": 1.05806684, "balance_loss_mlp": 1.04429567, "epoch": 0.05633548774988727, "flos": 17457039440640.0, "grad_norm": 1.9573264311231433, "language_loss": 0.7572521, "learning_rate": 3.992729665360331e-06, "loss": 0.78007358, "num_input_tokens_seen": 20007685, "step": 937, "time_per_iteration": 4.219425916671753 }, { "auxiliary_loss_clip": 0.01085529, "auxiliary_loss_mlp": 0.01013805, "balance_loss_clip": 1.02476001, "balance_loss_mlp": 1.00944233, "epoch": 0.05639561100255524, "flos": 70654928083200.0, "grad_norm": 0.9053055994078011, "language_loss": 0.64309287, "learning_rate": 3.992696449831162e-06, "loss": 0.66408622, "num_input_tokens_seen": 20072750, "step": 938, "time_per_iteration": 3.1298794746398926 }, { "auxiliary_loss_clip": 0.01171203, "auxiliary_loss_mlp": 0.01068815, "balance_loss_clip": 1.05175185, "balance_loss_mlp": 1.0426966, "epoch": 0.056455734255223204, "flos": 20485996692480.0, "grad_norm": 2.7427540631348832, "language_loss": 0.79751205, "learning_rate": 3.992663158738745e-06, "loss": 0.8199122, "num_input_tokens_seen": 20089070, "step": 939, "time_per_iteration": 2.6863484382629395 }, { "auxiliary_loss_clip": 0.01175528, "auxiliary_loss_mlp": 0.01068297, "balance_loss_clip": 1.0509069, "balance_loss_mlp": 1.04338217, "epoch": 0.056515857507891176, "flos": 22053569109120.0, "grad_norm": 1.8374791395473227, "language_loss": 0.73919088, "learning_rate": 3.992629792084341e-06, "loss": 0.76162916, "num_input_tokens_seen": 20108790, "step": 940, "time_per_iteration": 2.7111120223999023 }, { "auxiliary_loss_clip": 0.01198483, "auxiliary_loss_mlp": 0.01058511, "balance_loss_clip": 1.05900669, "balance_loss_mlp": 1.03252339, "epoch": 0.05657598076055915, "flos": 24025316336640.0, "grad_norm": 2.2993716569389813, "language_loss": 0.70622003, "learning_rate": 3.992596349869216e-06, "loss": 0.72878999, "num_input_tokens_seen": 20128455, "step": 941, "time_per_iteration": 2.657594680786133 }, { "auxiliary_loss_clip": 0.01135396, "auxiliary_loss_mlp": 0.01059543, "balance_loss_clip": 1.04961574, "balance_loss_mlp": 1.03382993, "epoch": 0.05663610401322711, "flos": 20480609652480.0, "grad_norm": 2.0678542992190847, "language_loss": 0.80921417, "learning_rate": 3.992562832094637e-06, "loss": 0.83116359, "num_input_tokens_seen": 20145775, "step": 942, "time_per_iteration": 2.7379891872406006 }, { "auxiliary_loss_clip": 0.01186767, "auxiliary_loss_mlp": 0.01062055, "balance_loss_clip": 1.05228579, "balance_loss_mlp": 1.03554332, "epoch": 0.056696227265895086, "flos": 21069042255360.0, "grad_norm": 2.245249922529115, "language_loss": 0.88858449, "learning_rate": 3.9925292387618755e-06, "loss": 0.91107273, "num_input_tokens_seen": 20164315, "step": 943, "time_per_iteration": 2.6502583026885986 }, { "auxiliary_loss_clip": 0.01199122, "auxiliary_loss_mlp": 0.0105963, "balance_loss_clip": 1.05991781, "balance_loss_mlp": 1.03534663, "epoch": 0.05675635051856306, "flos": 17821317219840.0, "grad_norm": 2.5514256959015995, "language_loss": 0.74771839, "learning_rate": 3.992495569872206e-06, "loss": 0.77030593, "num_input_tokens_seen": 20182760, "step": 944, "time_per_iteration": 2.676079034805298 }, { "auxiliary_loss_clip": 0.01204502, "auxiliary_loss_mlp": 0.01064591, "balance_loss_clip": 1.05980551, "balance_loss_mlp": 1.04085672, "epoch": 0.05681647377123102, "flos": 23114945111040.0, "grad_norm": 1.5959266123312272, "language_loss": 0.79406166, "learning_rate": 3.992461825426906e-06, "loss": 0.81675267, "num_input_tokens_seen": 20203830, "step": 945, "time_per_iteration": 2.734299421310425 }, { "auxiliary_loss_clip": 0.01195984, "auxiliary_loss_mlp": 0.0105672, "balance_loss_clip": 1.05686593, "balance_loss_mlp": 1.03156662, "epoch": 0.056876597023898995, "flos": 16070528505600.0, "grad_norm": 2.5637081249861824, "language_loss": 0.82651746, "learning_rate": 3.992428005427252e-06, "loss": 0.84904456, "num_input_tokens_seen": 20220365, "step": 946, "time_per_iteration": 2.6636929512023926 }, { "auxiliary_loss_clip": 0.0122014, "auxiliary_loss_mlp": 0.01061449, "balance_loss_clip": 1.06224144, "balance_loss_mlp": 1.03524721, "epoch": 0.05693672027656696, "flos": 16835641130880.0, "grad_norm": 1.8433174156507384, "language_loss": 0.79031301, "learning_rate": 3.992394109874529e-06, "loss": 0.81312895, "num_input_tokens_seen": 20238640, "step": 947, "time_per_iteration": 2.623671293258667 }, { "auxiliary_loss_clip": 0.0117587, "auxiliary_loss_mlp": 0.01061489, "balance_loss_clip": 1.05605412, "balance_loss_mlp": 1.03569245, "epoch": 0.05699684352923493, "flos": 21389113370880.0, "grad_norm": 6.8661947111986725, "language_loss": 0.85425055, "learning_rate": 3.9923601387700225e-06, "loss": 0.87662411, "num_input_tokens_seen": 20251025, "step": 948, "time_per_iteration": 2.7410409450531006 }, { "auxiliary_loss_clip": 0.01214005, "auxiliary_loss_mlp": 0.01063231, "balance_loss_clip": 1.05969238, "balance_loss_mlp": 1.03598022, "epoch": 0.057056966781902904, "flos": 15560309767680.0, "grad_norm": 3.649211317819821, "language_loss": 0.87346625, "learning_rate": 3.992326092115019e-06, "loss": 0.89623863, "num_input_tokens_seen": 20269775, "step": 949, "time_per_iteration": 2.6893157958984375 }, { "auxiliary_loss_clip": 0.01194543, "auxiliary_loss_mlp": 0.0106695, "balance_loss_clip": 1.05799937, "balance_loss_mlp": 1.04266715, "epoch": 0.05711709003457087, "flos": 19937856170880.0, "grad_norm": 1.8324883776363103, "language_loss": 0.7874645, "learning_rate": 3.992291969910811e-06, "loss": 0.8100794, "num_input_tokens_seen": 20287715, "step": 950, "time_per_iteration": 2.623924732208252 }, { "auxiliary_loss_clip": 0.01180518, "auxiliary_loss_mlp": 0.01068771, "balance_loss_clip": 1.05322623, "balance_loss_mlp": 1.04384422, "epoch": 0.05717721328723884, "flos": 30332701774080.0, "grad_norm": 3.8045132244795816, "language_loss": 0.82477522, "learning_rate": 3.992257772158691e-06, "loss": 0.8472681, "num_input_tokens_seen": 20307070, "step": 951, "time_per_iteration": 2.697479724884033 }, { "auxiliary_loss_clip": 0.01167302, "auxiliary_loss_mlp": 0.01061039, "balance_loss_clip": 1.04906607, "balance_loss_mlp": 1.03375173, "epoch": 0.05723733653990681, "flos": 23654358627840.0, "grad_norm": 2.4180383362968634, "language_loss": 0.86899263, "learning_rate": 3.992223498859958e-06, "loss": 0.89127606, "num_input_tokens_seen": 20324945, "step": 952, "time_per_iteration": 2.707716226577759 }, { "auxiliary_loss_clip": 0.01191405, "auxiliary_loss_mlp": 0.01064705, "balance_loss_clip": 1.05511189, "balance_loss_mlp": 1.03630924, "epoch": 0.05729745979257478, "flos": 22055759838720.0, "grad_norm": 2.195434645270168, "language_loss": 0.79087842, "learning_rate": 3.9921891500159084e-06, "loss": 0.81343949, "num_input_tokens_seen": 20346135, "step": 953, "time_per_iteration": 2.671255588531494 }, { "auxiliary_loss_clip": 0.01190026, "auxiliary_loss_mlp": 0.01066447, "balance_loss_clip": 1.05984342, "balance_loss_mlp": 1.04056656, "epoch": 0.05735758304524275, "flos": 19604353368960.0, "grad_norm": 2.2066085695914466, "language_loss": 0.86644447, "learning_rate": 3.992154725627848e-06, "loss": 0.88900924, "num_input_tokens_seen": 20364450, "step": 954, "time_per_iteration": 2.671657085418701 }, { "auxiliary_loss_clip": 0.01210569, "auxiliary_loss_mlp": 0.01062619, "balance_loss_clip": 1.06119955, "balance_loss_mlp": 1.03723955, "epoch": 0.057417706297910716, "flos": 19099018880640.0, "grad_norm": 2.2872795023766113, "language_loss": 0.88071024, "learning_rate": 3.9921202256970804e-06, "loss": 0.90344214, "num_input_tokens_seen": 20383500, "step": 955, "time_per_iteration": 2.69960880279541 }, { "auxiliary_loss_clip": 0.01179864, "auxiliary_loss_mlp": 0.01068889, "balance_loss_clip": 1.0523231, "balance_loss_mlp": 1.04209054, "epoch": 0.05747782955057869, "flos": 16654507822080.0, "grad_norm": 1.9113555723128555, "language_loss": 0.89160776, "learning_rate": 3.992085650224914e-06, "loss": 0.91409534, "num_input_tokens_seen": 20400295, "step": 956, "time_per_iteration": 2.667868137359619 }, { "auxiliary_loss_clip": 0.01167867, "auxiliary_loss_mlp": 0.01060669, "balance_loss_clip": 1.05720079, "balance_loss_mlp": 1.03450251, "epoch": 0.05753795280324665, "flos": 14502058248960.0, "grad_norm": 3.2877973901728095, "language_loss": 0.75473189, "learning_rate": 3.99205099921266e-06, "loss": 0.77701724, "num_input_tokens_seen": 20419085, "step": 957, "time_per_iteration": 2.6938796043395996 }, { "auxiliary_loss_clip": 0.0117627, "auxiliary_loss_mlp": 0.01072849, "balance_loss_clip": 1.05432248, "balance_loss_mlp": 1.0448705, "epoch": 0.057598076055914625, "flos": 18076318848000.0, "grad_norm": 2.0004055711005257, "language_loss": 0.79582155, "learning_rate": 3.992016272661633e-06, "loss": 0.81831264, "num_input_tokens_seen": 20437465, "step": 958, "time_per_iteration": 2.6933834552764893 }, { "auxiliary_loss_clip": 0.01186244, "auxiliary_loss_mlp": 0.01059908, "balance_loss_clip": 1.05851364, "balance_loss_mlp": 1.03572011, "epoch": 0.0576581993085826, "flos": 22124600254080.0, "grad_norm": 2.669863855173802, "language_loss": 0.8840394, "learning_rate": 3.99198147057315e-06, "loss": 0.906501, "num_input_tokens_seen": 20456235, "step": 959, "time_per_iteration": 2.7094578742980957 }, { "auxiliary_loss_clip": 0.01169479, "auxiliary_loss_mlp": 0.01063656, "balance_loss_clip": 1.05511999, "balance_loss_mlp": 1.03881276, "epoch": 0.05771832256125056, "flos": 33181746779520.0, "grad_norm": 2.0960373333994764, "language_loss": 0.78850955, "learning_rate": 3.991946592948529e-06, "loss": 0.8108409, "num_input_tokens_seen": 20476825, "step": 960, "time_per_iteration": 2.822922945022583 }, { "auxiliary_loss_clip": 0.0113413, "auxiliary_loss_mlp": 0.01067189, "balance_loss_clip": 1.05177355, "balance_loss_mlp": 1.04020023, "epoch": 0.057778445813918534, "flos": 24170143973760.0, "grad_norm": 2.063464892179025, "language_loss": 0.92986894, "learning_rate": 3.991911639789094e-06, "loss": 0.95188212, "num_input_tokens_seen": 20496965, "step": 961, "time_per_iteration": 2.793952226638794 }, { "auxiliary_loss_clip": 0.01182535, "auxiliary_loss_mlp": 0.0106764, "balance_loss_clip": 1.0554297, "balance_loss_mlp": 1.04091299, "epoch": 0.0578385690665865, "flos": 29643037666560.0, "grad_norm": 2.0649993155313067, "language_loss": 0.68164188, "learning_rate": 3.991876611096169e-06, "loss": 0.70414358, "num_input_tokens_seen": 20518035, "step": 962, "time_per_iteration": 2.8396694660186768 }, { "auxiliary_loss_clip": 0.01159524, "auxiliary_loss_mlp": 0.01073851, "balance_loss_clip": 1.05128908, "balance_loss_mlp": 1.04909074, "epoch": 0.05789869231925447, "flos": 20885430908160.0, "grad_norm": 2.2685465488517074, "language_loss": 0.8848027, "learning_rate": 3.991841506871084e-06, "loss": 0.90713644, "num_input_tokens_seen": 20534740, "step": 963, "time_per_iteration": 2.7077019214630127 }, { "auxiliary_loss_clip": 0.01183778, "auxiliary_loss_mlp": 0.01061251, "balance_loss_clip": 1.06018209, "balance_loss_mlp": 1.03516829, "epoch": 0.057958815571922444, "flos": 26031106679040.0, "grad_norm": 2.392959969035536, "language_loss": 0.85288298, "learning_rate": 3.99180632711517e-06, "loss": 0.87533331, "num_input_tokens_seen": 20553485, "step": 964, "time_per_iteration": 2.7218217849731445 }, { "auxiliary_loss_clip": 0.01188683, "auxiliary_loss_mlp": 0.01069422, "balance_loss_clip": 1.05959499, "balance_loss_mlp": 1.04325557, "epoch": 0.05801893882459041, "flos": 18077683564800.0, "grad_norm": 3.087349735715565, "language_loss": 0.78159416, "learning_rate": 3.99177107182976e-06, "loss": 0.80417526, "num_input_tokens_seen": 20572155, "step": 965, "time_per_iteration": 2.6902661323547363 }, { "auxiliary_loss_clip": 0.01156531, "auxiliary_loss_mlp": 0.0107109, "balance_loss_clip": 1.0523715, "balance_loss_mlp": 1.04462528, "epoch": 0.05807906207725838, "flos": 17748885444480.0, "grad_norm": 1.9742288518319486, "language_loss": 0.81403655, "learning_rate": 3.99173574101619e-06, "loss": 0.83631277, "num_input_tokens_seen": 20590395, "step": 966, "time_per_iteration": 2.7423267364501953 }, { "auxiliary_loss_clip": 0.01198908, "auxiliary_loss_mlp": 0.01065021, "balance_loss_clip": 1.058887, "balance_loss_mlp": 1.04113197, "epoch": 0.058139185329926346, "flos": 18040372312320.0, "grad_norm": 1.8776530142118544, "language_loss": 0.76480806, "learning_rate": 3.9917003346758035e-06, "loss": 0.78744727, "num_input_tokens_seen": 20608435, "step": 967, "time_per_iteration": 2.642885446548462 }, { "auxiliary_loss_clip": 0.01084339, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.02675521, "balance_loss_mlp": 1.0269078, "epoch": 0.05819930858259432, "flos": 62363297485440.0, "grad_norm": 0.985564929959949, "language_loss": 0.57357776, "learning_rate": 3.991664852809939e-06, "loss": 0.59473509, "num_input_tokens_seen": 20668575, "step": 968, "time_per_iteration": 3.1017024517059326 }, { "auxiliary_loss_clip": 0.01188824, "auxiliary_loss_mlp": 0.01057715, "balance_loss_clip": 1.05784404, "balance_loss_mlp": 1.03147697, "epoch": 0.05825943183526229, "flos": 19135360465920.0, "grad_norm": 2.1276337565108485, "language_loss": 0.82286429, "learning_rate": 3.991629295419945e-06, "loss": 0.84532964, "num_input_tokens_seen": 20687355, "step": 969, "time_per_iteration": 2.669055461883545 }, { "auxiliary_loss_clip": 0.01206272, "auxiliary_loss_mlp": 0.00782724, "balance_loss_clip": 1.06255269, "balance_loss_mlp": 1.00024962, "epoch": 0.058319555087930255, "flos": 29022465369600.0, "grad_norm": 7.916507288074279, "language_loss": 0.7803669, "learning_rate": 3.991593662507167e-06, "loss": 0.80025685, "num_input_tokens_seen": 20705710, "step": 970, "time_per_iteration": 2.733030080795288 }, { "auxiliary_loss_clip": 0.01181452, "auxiliary_loss_mlp": 0.01064945, "balance_loss_clip": 1.05691695, "balance_loss_mlp": 1.03887415, "epoch": 0.05837967834059823, "flos": 18879999701760.0, "grad_norm": 3.163102883752813, "language_loss": 0.92229038, "learning_rate": 3.991557954072958e-06, "loss": 0.94475436, "num_input_tokens_seen": 20722405, "step": 971, "time_per_iteration": 2.730377435684204 }, { "auxiliary_loss_clip": 0.01180948, "auxiliary_loss_mlp": 0.01062613, "balance_loss_clip": 1.05320477, "balance_loss_mlp": 1.03722143, "epoch": 0.05843980159326619, "flos": 25703062744320.0, "grad_norm": 1.700187330091603, "language_loss": 0.85959208, "learning_rate": 3.991522170118673e-06, "loss": 0.88202775, "num_input_tokens_seen": 20741480, "step": 972, "time_per_iteration": 2.687185049057007 }, { "auxiliary_loss_clip": 0.0116993, "auxiliary_loss_mlp": 0.01079713, "balance_loss_clip": 1.05714142, "balance_loss_mlp": 1.05601454, "epoch": 0.058499924845934165, "flos": 25552129795200.0, "grad_norm": 2.00599255988541, "language_loss": 0.87503272, "learning_rate": 3.991486310645667e-06, "loss": 0.89752913, "num_input_tokens_seen": 20759685, "step": 973, "time_per_iteration": 2.7166664600372314 }, { "auxiliary_loss_clip": 0.01206524, "auxiliary_loss_mlp": 0.00784111, "balance_loss_clip": 1.06111121, "balance_loss_mlp": 1.00026989, "epoch": 0.05856004809860214, "flos": 16436171001600.0, "grad_norm": 1.879365930358842, "language_loss": 0.74800295, "learning_rate": 3.991450375655301e-06, "loss": 0.76790935, "num_input_tokens_seen": 20778180, "step": 974, "time_per_iteration": 2.713594675064087 }, { "auxiliary_loss_clip": 0.01197101, "auxiliary_loss_mlp": 0.00782207, "balance_loss_clip": 1.059551, "balance_loss_mlp": 1.00025892, "epoch": 0.0586201713512701, "flos": 39458824116480.0, "grad_norm": 1.5923993506380014, "language_loss": 0.76874506, "learning_rate": 3.991414365148936e-06, "loss": 0.78853816, "num_input_tokens_seen": 20802705, "step": 975, "time_per_iteration": 7.600914716720581 }, { "auxiliary_loss_clip": 0.01215491, "auxiliary_loss_mlp": 0.01069506, "balance_loss_clip": 1.06030774, "balance_loss_mlp": 1.0444721, "epoch": 0.058680294603938074, "flos": 23365170230400.0, "grad_norm": 3.6132976830219734, "language_loss": 0.76748288, "learning_rate": 3.99137827912794e-06, "loss": 0.79033279, "num_input_tokens_seen": 20822540, "step": 976, "time_per_iteration": 4.324799537658691 }, { "auxiliary_loss_clip": 0.01176132, "auxiliary_loss_mlp": 0.01077003, "balance_loss_clip": 1.05271626, "balance_loss_mlp": 1.04963279, "epoch": 0.05874041785660604, "flos": 32232017226240.0, "grad_norm": 1.943198757110789, "language_loss": 0.87343585, "learning_rate": 3.991342117593679e-06, "loss": 0.89596725, "num_input_tokens_seen": 20844175, "step": 977, "time_per_iteration": 2.7742488384246826 }, { "auxiliary_loss_clip": 0.01187161, "auxiliary_loss_mlp": 0.01067914, "balance_loss_clip": 1.06209528, "balance_loss_mlp": 1.04231977, "epoch": 0.05880054110927401, "flos": 22310043194880.0, "grad_norm": 1.718987046197629, "language_loss": 0.7969116, "learning_rate": 3.991305880547527e-06, "loss": 0.81946236, "num_input_tokens_seen": 20864730, "step": 978, "time_per_iteration": 2.733372926712036 }, { "auxiliary_loss_clip": 0.01136264, "auxiliary_loss_mlp": 0.01076585, "balance_loss_clip": 1.05591321, "balance_loss_mlp": 1.04927468, "epoch": 0.05886066436194198, "flos": 27380450016000.0, "grad_norm": 1.8692877257975375, "language_loss": 0.80665666, "learning_rate": 3.991269567990855e-06, "loss": 0.82878518, "num_input_tokens_seen": 20885200, "step": 979, "time_per_iteration": 3.2624220848083496 }, { "auxiliary_loss_clip": 0.01074686, "auxiliary_loss_mlp": 0.01029701, "balance_loss_clip": 1.02640033, "balance_loss_mlp": 1.02495658, "epoch": 0.05892078761460995, "flos": 59584493525760.0, "grad_norm": 0.9436493040005753, "language_loss": 0.59004962, "learning_rate": 3.9912331799250415e-06, "loss": 0.6110934, "num_input_tokens_seen": 20940325, "step": 980, "time_per_iteration": 3.4688587188720703 }, { "auxiliary_loss_clip": 0.01211665, "auxiliary_loss_mlp": 0.01078603, "balance_loss_clip": 1.06178868, "balance_loss_mlp": 1.05242431, "epoch": 0.05898091086727792, "flos": 15414081500160.0, "grad_norm": 2.2770545408130514, "language_loss": 0.86436182, "learning_rate": 3.9911967163514665e-06, "loss": 0.88726455, "num_input_tokens_seen": 20958220, "step": 981, "time_per_iteration": 2.5824644565582275 }, { "auxiliary_loss_clip": 0.01190085, "auxiliary_loss_mlp": 0.0106921, "balance_loss_clip": 1.05943286, "balance_loss_mlp": 1.04629803, "epoch": 0.059041034119945886, "flos": 23655328295040.0, "grad_norm": 2.1333982175691855, "language_loss": 0.79293346, "learning_rate": 3.991160177271513e-06, "loss": 0.81552643, "num_input_tokens_seen": 20978920, "step": 982, "time_per_iteration": 2.68428897857666 }, { "auxiliary_loss_clip": 0.01192274, "auxiliary_loss_mlp": 0.01068234, "balance_loss_clip": 1.05926657, "balance_loss_mlp": 1.04356933, "epoch": 0.05910115737261386, "flos": 24754087376640.0, "grad_norm": 2.319627739094249, "language_loss": 0.84413779, "learning_rate": 3.9911235626865654e-06, "loss": 0.86674285, "num_input_tokens_seen": 20999490, "step": 983, "time_per_iteration": 2.7006261348724365 }, { "auxiliary_loss_clip": 0.0120015, "auxiliary_loss_mlp": 0.01072669, "balance_loss_clip": 1.05969584, "balance_loss_mlp": 1.04799283, "epoch": 0.05916128062528183, "flos": 11728749070080.0, "grad_norm": 1.8014395118859294, "language_loss": 0.84510243, "learning_rate": 3.9910868725980125e-06, "loss": 0.86783063, "num_input_tokens_seen": 21017865, "step": 984, "time_per_iteration": 2.640246868133545 }, { "auxiliary_loss_clip": 0.01188594, "auxiliary_loss_mlp": 0.01055296, "balance_loss_clip": 1.05650342, "balance_loss_mlp": 1.03171611, "epoch": 0.059221403877949795, "flos": 21902995296000.0, "grad_norm": 2.473231587287368, "language_loss": 0.77611595, "learning_rate": 3.9910501070072465e-06, "loss": 0.7985549, "num_input_tokens_seen": 21035900, "step": 985, "time_per_iteration": 2.626371383666992 }, { "auxiliary_loss_clip": 0.01150113, "auxiliary_loss_mlp": 0.01060814, "balance_loss_clip": 1.05341148, "balance_loss_mlp": 1.03542209, "epoch": 0.05928152713061777, "flos": 20514580940160.0, "grad_norm": 1.9082382068459252, "language_loss": 0.90593231, "learning_rate": 3.991013265915661e-06, "loss": 0.92804158, "num_input_tokens_seen": 21053235, "step": 986, "time_per_iteration": 2.7834935188293457 }, { "auxiliary_loss_clip": 0.01200704, "auxiliary_loss_mlp": 0.01061312, "balance_loss_clip": 1.05555892, "balance_loss_mlp": 1.03425193, "epoch": 0.05934165038328574, "flos": 24495135252480.0, "grad_norm": 2.216017383423336, "language_loss": 0.75688565, "learning_rate": 3.9909763493246525e-06, "loss": 0.77950585, "num_input_tokens_seen": 21073090, "step": 987, "time_per_iteration": 2.6669981479644775 }, { "auxiliary_loss_clip": 0.01203558, "auxiliary_loss_mlp": 0.01057756, "balance_loss_clip": 1.06134868, "balance_loss_mlp": 1.03331852, "epoch": 0.059401773635953704, "flos": 38728041914880.0, "grad_norm": 2.2869993581633827, "language_loss": 0.71867943, "learning_rate": 3.990939357235621e-06, "loss": 0.7412926, "num_input_tokens_seen": 21094895, "step": 988, "time_per_iteration": 2.805851697921753 }, { "auxiliary_loss_clip": 0.0105006, "auxiliary_loss_mlp": 0.0101134, "balance_loss_clip": 1.02230322, "balance_loss_mlp": 1.00688171, "epoch": 0.059461896888621676, "flos": 58023565125120.0, "grad_norm": 0.9416454944601763, "language_loss": 0.7124939, "learning_rate": 3.99090228964997e-06, "loss": 0.73310792, "num_input_tokens_seen": 21147555, "step": 989, "time_per_iteration": 3.100306749343872 }, { "auxiliary_loss_clip": 0.0117797, "auxiliary_loss_mlp": 0.01072264, "balance_loss_clip": 1.05793095, "balance_loss_mlp": 1.04389191, "epoch": 0.05952202014128964, "flos": 22127760650880.0, "grad_norm": 2.0167260155417113, "language_loss": 0.78245646, "learning_rate": 3.990865146569105e-06, "loss": 0.80495882, "num_input_tokens_seen": 21167845, "step": 990, "time_per_iteration": 2.8133904933929443 }, { "auxiliary_loss_clip": 0.01198295, "auxiliary_loss_mlp": 0.01053485, "balance_loss_clip": 1.06166339, "balance_loss_mlp": 1.02761686, "epoch": 0.059582143393957614, "flos": 20445776438400.0, "grad_norm": 2.2411623387553727, "language_loss": 0.86522102, "learning_rate": 3.990827927994434e-06, "loss": 0.88773882, "num_input_tokens_seen": 21185085, "step": 991, "time_per_iteration": 2.6964831352233887 }, { "auxiliary_loss_clip": 0.0121783, "auxiliary_loss_mlp": 0.01064707, "balance_loss_clip": 1.0613625, "balance_loss_mlp": 1.03943431, "epoch": 0.059642266646625586, "flos": 20594877793920.0, "grad_norm": 1.8566945591898132, "language_loss": 0.76738375, "learning_rate": 3.9907906339273674e-06, "loss": 0.79020917, "num_input_tokens_seen": 21204230, "step": 992, "time_per_iteration": 2.646942377090454 }, { "auxiliary_loss_clip": 0.01146457, "auxiliary_loss_mlp": 0.01062309, "balance_loss_clip": 1.05571234, "balance_loss_mlp": 1.03834832, "epoch": 0.05970238989929355, "flos": 19352655792000.0, "grad_norm": 2.3469050968731233, "language_loss": 0.75117075, "learning_rate": 3.9907532643693215e-06, "loss": 0.77325845, "num_input_tokens_seen": 21222655, "step": 993, "time_per_iteration": 2.7642974853515625 }, { "auxiliary_loss_clip": 0.01157785, "auxiliary_loss_mlp": 0.01075532, "balance_loss_clip": 1.05397618, "balance_loss_mlp": 1.04774487, "epoch": 0.05976251315196152, "flos": 30264040926720.0, "grad_norm": 2.725207959052886, "language_loss": 0.79177904, "learning_rate": 3.990715819321712e-06, "loss": 0.81411219, "num_input_tokens_seen": 21242310, "step": 994, "time_per_iteration": 2.8414714336395264 }, { "auxiliary_loss_clip": 0.01214724, "auxiliary_loss_mlp": 0.01079016, "balance_loss_clip": 1.06264019, "balance_loss_mlp": 1.05361295, "epoch": 0.05982263640462949, "flos": 23185150243200.0, "grad_norm": 2.8097993094234983, "language_loss": 0.79917169, "learning_rate": 3.99067829878596e-06, "loss": 0.82210916, "num_input_tokens_seen": 21261410, "step": 995, "time_per_iteration": 2.6524364948272705 }, { "auxiliary_loss_clip": 0.0116696, "auxiliary_loss_mlp": 0.01068218, "balance_loss_clip": 1.05704355, "balance_loss_mlp": 1.04208767, "epoch": 0.05988275965729746, "flos": 27850879463040.0, "grad_norm": 1.902030256537741, "language_loss": 0.87013257, "learning_rate": 3.990640702763487e-06, "loss": 0.89248431, "num_input_tokens_seen": 21280080, "step": 996, "time_per_iteration": 2.7431676387786865 }, { "auxiliary_loss_clip": 0.01177854, "auxiliary_loss_mlp": 0.01081123, "balance_loss_clip": 1.05684328, "balance_loss_mlp": 1.05055761, "epoch": 0.05994288290996543, "flos": 24680003575680.0, "grad_norm": 2.971039758986745, "language_loss": 0.87273014, "learning_rate": 3.990603031255718e-06, "loss": 0.89531994, "num_input_tokens_seen": 21296765, "step": 997, "time_per_iteration": 2.748448371887207 }, { "auxiliary_loss_clip": 0.01069915, "auxiliary_loss_mlp": 0.01014417, "balance_loss_clip": 1.02303648, "balance_loss_mlp": 1.00972033, "epoch": 0.0600030061626334, "flos": 69929568835200.0, "grad_norm": 1.0091092068179202, "language_loss": 0.75381488, "learning_rate": 3.990565284264083e-06, "loss": 0.7746582, "num_input_tokens_seen": 21363345, "step": 998, "time_per_iteration": 3.2950518131256104 }, { "auxiliary_loss_clip": 0.01170062, "auxiliary_loss_mlp": 0.01065521, "balance_loss_clip": 1.05893683, "balance_loss_mlp": 1.03893745, "epoch": 0.06006312941530137, "flos": 26540140268160.0, "grad_norm": 1.8197691299520968, "language_loss": 0.76053095, "learning_rate": 3.990527461790013e-06, "loss": 0.7828868, "num_input_tokens_seen": 21385290, "step": 999, "time_per_iteration": 2.733802556991577 }, { "auxiliary_loss_clip": 0.01197834, "auxiliary_loss_mlp": 0.01059542, "balance_loss_clip": 1.05646563, "balance_loss_mlp": 1.03339899, "epoch": 0.060123252667969335, "flos": 27344000689920.0, "grad_norm": 2.5948629341774874, "language_loss": 0.82992184, "learning_rate": 3.990489563834943e-06, "loss": 0.85249555, "num_input_tokens_seen": 21407625, "step": 1000, "time_per_iteration": 2.710981845855713 }, { "auxiliary_loss_clip": 0.0118571, "auxiliary_loss_mlp": 0.01062188, "balance_loss_clip": 1.05856955, "balance_loss_mlp": 1.03480577, "epoch": 0.06018337592063731, "flos": 27016710940800.0, "grad_norm": 2.111409807940472, "language_loss": 0.85820085, "learning_rate": 3.990451590400309e-06, "loss": 0.88067985, "num_input_tokens_seen": 21426835, "step": 1001, "time_per_iteration": 2.73445463180542 }, { "auxiliary_loss_clip": 0.01191917, "auxiliary_loss_mlp": 0.01062059, "balance_loss_clip": 1.06167853, "balance_loss_mlp": 1.03719211, "epoch": 0.06024349917330528, "flos": 25592960580480.0, "grad_norm": 1.8359711451165206, "language_loss": 0.74128318, "learning_rate": 3.990413541487551e-06, "loss": 0.76382297, "num_input_tokens_seen": 21444920, "step": 1002, "time_per_iteration": 2.8861100673675537 }, { "auxiliary_loss_clip": 0.01214316, "auxiliary_loss_mlp": 0.01062589, "balance_loss_clip": 1.06316125, "balance_loss_mlp": 1.03737664, "epoch": 0.060303622425973244, "flos": 26133271937280.0, "grad_norm": 2.1835040648243997, "language_loss": 0.75520515, "learning_rate": 3.990375417098112e-06, "loss": 0.77797419, "num_input_tokens_seen": 21463555, "step": 1003, "time_per_iteration": 2.632889747619629 }, { "auxiliary_loss_clip": 0.01187709, "auxiliary_loss_mlp": 0.01064806, "balance_loss_clip": 1.05934548, "balance_loss_mlp": 1.03928304, "epoch": 0.060363745678641216, "flos": 20377187418240.0, "grad_norm": 2.3150099602993155, "language_loss": 0.70349169, "learning_rate": 3.990337217233437e-06, "loss": 0.72601682, "num_input_tokens_seen": 21481990, "step": 1004, "time_per_iteration": 2.6947617530822754 }, { "auxiliary_loss_clip": 0.01212815, "auxiliary_loss_mlp": 0.01077454, "balance_loss_clip": 1.06629324, "balance_loss_mlp": 1.05168116, "epoch": 0.06042386893130918, "flos": 17749172753280.0, "grad_norm": 2.276868338025253, "language_loss": 0.83444524, "learning_rate": 3.990298941894976e-06, "loss": 0.85734791, "num_input_tokens_seen": 21500385, "step": 1005, "time_per_iteration": 2.581683397293091 }, { "auxiliary_loss_clip": 0.01077621, "auxiliary_loss_mlp": 0.01004707, "balance_loss_clip": 1.02541244, "balance_loss_mlp": 1.00029612, "epoch": 0.06048399218397715, "flos": 68538496872960.0, "grad_norm": 0.903813421793838, "language_loss": 0.59018111, "learning_rate": 3.9902605910841794e-06, "loss": 0.61100447, "num_input_tokens_seen": 21561040, "step": 1006, "time_per_iteration": 3.222104787826538 }, { "auxiliary_loss_clip": 0.01183553, "auxiliary_loss_mlp": 0.01059038, "balance_loss_clip": 1.05334234, "balance_loss_mlp": 1.03284812, "epoch": 0.060544115436645125, "flos": 23258515772160.0, "grad_norm": 2.1584333764290853, "language_loss": 0.74229443, "learning_rate": 3.990222164802503e-06, "loss": 0.76472032, "num_input_tokens_seen": 21580655, "step": 1007, "time_per_iteration": 2.7130653858184814 }, { "auxiliary_loss_clip": 0.0119408, "auxiliary_loss_mlp": 0.01060431, "balance_loss_clip": 1.06008601, "balance_loss_mlp": 1.03493261, "epoch": 0.06060423868931309, "flos": 23878441624320.0, "grad_norm": 1.7956876298455304, "language_loss": 0.8081426, "learning_rate": 3.9901836630514006e-06, "loss": 0.8306877, "num_input_tokens_seen": 21599650, "step": 1008, "time_per_iteration": 2.7151994705200195 }, { "auxiliary_loss_clip": 0.01175291, "auxiliary_loss_mlp": 0.01056357, "balance_loss_clip": 1.05982351, "balance_loss_mlp": 1.0305717, "epoch": 0.06066436194198106, "flos": 18728061171840.0, "grad_norm": 2.3069524306559837, "language_loss": 0.78198558, "learning_rate": 3.990145085832335e-06, "loss": 0.8043021, "num_input_tokens_seen": 21617550, "step": 1009, "time_per_iteration": 2.7313599586486816 }, { "auxiliary_loss_clip": 0.01194621, "auxiliary_loss_mlp": 0.01061233, "balance_loss_clip": 1.06150866, "balance_loss_mlp": 1.03726041, "epoch": 0.06072448519464903, "flos": 24640465680000.0, "grad_norm": 1.7452257697216769, "language_loss": 0.93148172, "learning_rate": 3.990106433146769e-06, "loss": 0.95404023, "num_input_tokens_seen": 21635865, "step": 1010, "time_per_iteration": 2.7233662605285645 }, { "auxiliary_loss_clip": 0.01148246, "auxiliary_loss_mlp": 0.00784144, "balance_loss_clip": 1.05304599, "balance_loss_mlp": 1.00029802, "epoch": 0.060784608447317, "flos": 17378825575680.0, "grad_norm": 2.9999367504779517, "language_loss": 0.72022474, "learning_rate": 3.9900677049961665e-06, "loss": 0.73954868, "num_input_tokens_seen": 21653945, "step": 1011, "time_per_iteration": 2.804858446121216 }, { "auxiliary_loss_clip": 0.01194231, "auxiliary_loss_mlp": 0.01077344, "balance_loss_clip": 1.05968046, "balance_loss_mlp": 1.04868615, "epoch": 0.06084473169998497, "flos": 23692208584320.0, "grad_norm": 1.9573218215833301, "language_loss": 0.87526691, "learning_rate": 3.990028901381999e-06, "loss": 0.89798272, "num_input_tokens_seen": 21671230, "step": 1012, "time_per_iteration": 2.6466245651245117 }, { "auxiliary_loss_clip": 0.01184459, "auxiliary_loss_mlp": 0.01064264, "balance_loss_clip": 1.05652905, "balance_loss_mlp": 1.03838325, "epoch": 0.06090485495265294, "flos": 23546339452800.0, "grad_norm": 1.9062230938156723, "language_loss": 0.76947677, "learning_rate": 3.989990022305734e-06, "loss": 0.79196405, "num_input_tokens_seen": 21691155, "step": 1013, "time_per_iteration": 4.297588586807251 }, { "auxiliary_loss_clip": 0.01207383, "auxiliary_loss_mlp": 0.00783488, "balance_loss_clip": 1.06573224, "balance_loss_mlp": 1.00034499, "epoch": 0.06096497820532091, "flos": 20339301548160.0, "grad_norm": 2.441711811862119, "language_loss": 0.86151874, "learning_rate": 3.98995106776885e-06, "loss": 0.88142747, "num_input_tokens_seen": 21707405, "step": 1014, "time_per_iteration": 4.301488637924194 }, { "auxiliary_loss_clip": 0.0121503, "auxiliary_loss_mlp": 0.01072817, "balance_loss_clip": 1.06605387, "balance_loss_mlp": 1.04508948, "epoch": 0.061025101457988874, "flos": 26939035779840.0, "grad_norm": 2.4309754772209184, "language_loss": 0.73197287, "learning_rate": 3.98991203777282e-06, "loss": 0.75485134, "num_input_tokens_seen": 21728090, "step": 1015, "time_per_iteration": 4.384514808654785 }, { "auxiliary_loss_clip": 0.01187374, "auxiliary_loss_mlp": 0.01068593, "balance_loss_clip": 1.06084347, "balance_loss_mlp": 1.04228365, "epoch": 0.061085224710656846, "flos": 25375054723200.0, "grad_norm": 1.5896529502124837, "language_loss": 0.79109907, "learning_rate": 3.9898729323191275e-06, "loss": 0.81365877, "num_input_tokens_seen": 21747950, "step": 1016, "time_per_iteration": 4.3249351978302 }, { "auxiliary_loss_clip": 0.01173015, "auxiliary_loss_mlp": 0.0105746, "balance_loss_clip": 1.06036103, "balance_loss_mlp": 1.03249741, "epoch": 0.06114534796332482, "flos": 24824759385600.0, "grad_norm": 1.6772682648410928, "language_loss": 0.76014191, "learning_rate": 3.989833751409254e-06, "loss": 0.78244662, "num_input_tokens_seen": 21767900, "step": 1017, "time_per_iteration": 2.7983243465423584 }, { "auxiliary_loss_clip": 0.01188817, "auxiliary_loss_mlp": 0.01074603, "balance_loss_clip": 1.06584609, "balance_loss_mlp": 1.0483532, "epoch": 0.061205471215992784, "flos": 20631434860800.0, "grad_norm": 2.001716657382839, "language_loss": 0.85798436, "learning_rate": 3.989794495044685e-06, "loss": 0.88061857, "num_input_tokens_seen": 21787375, "step": 1018, "time_per_iteration": 2.702399253845215 }, { "auxiliary_loss_clip": 0.01174344, "auxiliary_loss_mlp": 0.01069438, "balance_loss_clip": 1.06325769, "balance_loss_mlp": 1.04231787, "epoch": 0.061265594468660756, "flos": 16508351381760.0, "grad_norm": 2.9546929267460813, "language_loss": 0.76985347, "learning_rate": 3.989755163226909e-06, "loss": 0.79229128, "num_input_tokens_seen": 21806275, "step": 1019, "time_per_iteration": 2.780104875564575 }, { "auxiliary_loss_clip": 0.01160861, "auxiliary_loss_mlp": 0.0106141, "balance_loss_clip": 1.05355084, "balance_loss_mlp": 1.03511262, "epoch": 0.06132571772132872, "flos": 26246211275520.0, "grad_norm": 2.1848809329980288, "language_loss": 0.84122044, "learning_rate": 3.989715755957418e-06, "loss": 0.86344314, "num_input_tokens_seen": 21826430, "step": 1020, "time_per_iteration": 2.785963535308838 }, { "auxiliary_loss_clip": 0.01198473, "auxiliary_loss_mlp": 0.01063342, "balance_loss_clip": 1.06365371, "balance_loss_mlp": 1.03604269, "epoch": 0.06138584097399669, "flos": 37414788768000.0, "grad_norm": 1.933053672026977, "language_loss": 0.79114467, "learning_rate": 3.989676273237705e-06, "loss": 0.81376278, "num_input_tokens_seen": 21847800, "step": 1021, "time_per_iteration": 2.7968955039978027 }, { "auxiliary_loss_clip": 0.01189659, "auxiliary_loss_mlp": 0.01064044, "balance_loss_clip": 1.06159925, "balance_loss_mlp": 1.04114437, "epoch": 0.061445964226664665, "flos": 17420661941760.0, "grad_norm": 2.089525934673828, "language_loss": 0.87768298, "learning_rate": 3.9896367150692705e-06, "loss": 0.90022004, "num_input_tokens_seen": 21863385, "step": 1022, "time_per_iteration": 2.70906138420105 }, { "auxiliary_loss_clip": 0.01198737, "auxiliary_loss_mlp": 0.0106635, "balance_loss_clip": 1.06627858, "balance_loss_mlp": 1.04079151, "epoch": 0.06150608747933263, "flos": 22600021691520.0, "grad_norm": 1.7284486983379121, "language_loss": 0.82892007, "learning_rate": 3.989597081453611e-06, "loss": 0.85157096, "num_input_tokens_seen": 21881880, "step": 1023, "time_per_iteration": 2.71539568901062 }, { "auxiliary_loss_clip": 0.01100664, "auxiliary_loss_mlp": 0.01010751, "balance_loss_clip": 1.03727341, "balance_loss_mlp": 1.00614953, "epoch": 0.0615662107320006, "flos": 56741482005120.0, "grad_norm": 0.8894752517384502, "language_loss": 0.6505782, "learning_rate": 3.989557372392231e-06, "loss": 0.67169237, "num_input_tokens_seen": 21940550, "step": 1024, "time_per_iteration": 3.175217628479004 }, { "auxiliary_loss_clip": 0.01167458, "auxiliary_loss_mlp": 0.01073669, "balance_loss_clip": 1.05906856, "balance_loss_mlp": 1.04553604, "epoch": 0.06162633398466857, "flos": 22564793427840.0, "grad_norm": 2.320347485789288, "language_loss": 0.88069236, "learning_rate": 3.989517587886636e-06, "loss": 0.90310359, "num_input_tokens_seen": 21958390, "step": 1025, "time_per_iteration": 2.690725564956665 }, { "auxiliary_loss_clip": 0.01197064, "auxiliary_loss_mlp": 0.01066504, "balance_loss_clip": 1.06452, "balance_loss_mlp": 1.04173219, "epoch": 0.06168645723733654, "flos": 25593104234880.0, "grad_norm": 2.5217294712155414, "language_loss": 0.84536898, "learning_rate": 3.989477727938335e-06, "loss": 0.86800468, "num_input_tokens_seen": 21978625, "step": 1026, "time_per_iteration": 2.7420806884765625 }, { "auxiliary_loss_clip": 0.01160797, "auxiliary_loss_mlp": 0.0107525, "balance_loss_clip": 1.05669701, "balance_loss_mlp": 1.04934609, "epoch": 0.06174658049000451, "flos": 15997917162240.0, "grad_norm": 2.354014397396182, "language_loss": 0.8228389, "learning_rate": 3.989437792548839e-06, "loss": 0.84519935, "num_input_tokens_seen": 21996035, "step": 1027, "time_per_iteration": 2.6683874130249023 }, { "auxiliary_loss_clip": 0.01160199, "auxiliary_loss_mlp": 0.01067253, "balance_loss_clip": 1.06181073, "balance_loss_mlp": 1.04232645, "epoch": 0.06180670374267248, "flos": 11285970117120.0, "grad_norm": 4.43492107605727, "language_loss": 0.83898664, "learning_rate": 3.989397781719663e-06, "loss": 0.86126107, "num_input_tokens_seen": 22011625, "step": 1028, "time_per_iteration": 2.705387592315674 }, { "auxiliary_loss_clip": 0.0106503, "auxiliary_loss_mlp": 0.01008074, "balance_loss_clip": 1.02410197, "balance_loss_mlp": 1.00347257, "epoch": 0.06186682699534045, "flos": 65130142216320.0, "grad_norm": 0.9383255649985517, "language_loss": 0.604738, "learning_rate": 3.989357695452323e-06, "loss": 0.62546903, "num_input_tokens_seen": 22066035, "step": 1029, "time_per_iteration": 3.0268616676330566 }, { "auxiliary_loss_clip": 0.01176182, "auxiliary_loss_mlp": 0.01074173, "balance_loss_clip": 1.05641246, "balance_loss_mlp": 1.04737473, "epoch": 0.061926950248008414, "flos": 21105742976640.0, "grad_norm": 4.246634693563946, "language_loss": 0.82589179, "learning_rate": 3.98931753374834e-06, "loss": 0.84839535, "num_input_tokens_seen": 22085015, "step": 1030, "time_per_iteration": 2.7035892009735107 }, { "auxiliary_loss_clip": 0.0122298, "auxiliary_loss_mlp": 0.01077745, "balance_loss_clip": 1.06850278, "balance_loss_mlp": 1.05185235, "epoch": 0.061987073500676386, "flos": 17748454481280.0, "grad_norm": 2.585240230669548, "language_loss": 0.79576576, "learning_rate": 3.989277296609237e-06, "loss": 0.81877303, "num_input_tokens_seen": 22102775, "step": 1031, "time_per_iteration": 2.60622501373291 }, { "auxiliary_loss_clip": 0.01188957, "auxiliary_loss_mlp": 0.01076754, "balance_loss_clip": 1.06396544, "balance_loss_mlp": 1.04982424, "epoch": 0.06204719675334436, "flos": 21836237869440.0, "grad_norm": 1.8815476991595563, "language_loss": 0.77384412, "learning_rate": 3.98923698403654e-06, "loss": 0.79650116, "num_input_tokens_seen": 22121680, "step": 1032, "time_per_iteration": 2.6753971576690674 }, { "auxiliary_loss_clip": 0.01198757, "auxiliary_loss_mlp": 0.01074736, "balance_loss_clip": 1.05916619, "balance_loss_mlp": 1.04848623, "epoch": 0.06210732000601232, "flos": 19353697286400.0, "grad_norm": 3.147941025479245, "language_loss": 0.89323574, "learning_rate": 3.989196596031776e-06, "loss": 0.91597068, "num_input_tokens_seen": 22138155, "step": 1033, "time_per_iteration": 2.7313079833984375 }, { "auxiliary_loss_clip": 0.01209161, "auxiliary_loss_mlp": 0.01066082, "balance_loss_clip": 1.06214237, "balance_loss_mlp": 1.04119134, "epoch": 0.062167443258680295, "flos": 24749382695040.0, "grad_norm": 2.1035343880884145, "language_loss": 0.8455385, "learning_rate": 3.989156132596479e-06, "loss": 0.8682909, "num_input_tokens_seen": 22157420, "step": 1034, "time_per_iteration": 2.7541439533233643 }, { "auxiliary_loss_clip": 0.01180042, "auxiliary_loss_mlp": 0.01057312, "balance_loss_clip": 1.05896068, "balance_loss_mlp": 1.03155136, "epoch": 0.06222756651134827, "flos": 34458478773120.0, "grad_norm": 1.8983498110529735, "language_loss": 0.8082794, "learning_rate": 3.989115593732182e-06, "loss": 0.83065289, "num_input_tokens_seen": 22178620, "step": 1035, "time_per_iteration": 2.7965424060821533 }, { "auxiliary_loss_clip": 0.01158806, "auxiliary_loss_mlp": 0.01072478, "balance_loss_clip": 1.05936599, "balance_loss_mlp": 1.04432034, "epoch": 0.06228768976401623, "flos": 25666469763840.0, "grad_norm": 2.145216314952277, "language_loss": 0.78365827, "learning_rate": 3.989074979440421e-06, "loss": 0.80597103, "num_input_tokens_seen": 22197125, "step": 1036, "time_per_iteration": 2.7858450412750244 }, { "auxiliary_loss_clip": 0.01192097, "auxiliary_loss_mlp": 0.01071382, "balance_loss_clip": 1.05977845, "balance_loss_mlp": 1.04663444, "epoch": 0.062347813016684205, "flos": 25295619795840.0, "grad_norm": 1.9535870339716077, "language_loss": 0.86544567, "learning_rate": 3.989034289722739e-06, "loss": 0.88808048, "num_input_tokens_seen": 22217575, "step": 1037, "time_per_iteration": 2.685373306274414 }, { "auxiliary_loss_clip": 0.01197778, "auxiliary_loss_mlp": 0.01057095, "balance_loss_clip": 1.06127763, "balance_loss_mlp": 1.02966499, "epoch": 0.06240793626935217, "flos": 26907039740160.0, "grad_norm": 2.697396725345887, "language_loss": 0.8067717, "learning_rate": 3.988993524580676e-06, "loss": 0.82932043, "num_input_tokens_seen": 22236840, "step": 1038, "time_per_iteration": 2.7305831909179688 }, { "auxiliary_loss_clip": 0.01145896, "auxiliary_loss_mlp": 0.01072721, "balance_loss_clip": 1.05226004, "balance_loss_mlp": 1.04330015, "epoch": 0.06246805952202014, "flos": 21615782146560.0, "grad_norm": 1.8888526922505675, "language_loss": 0.85465872, "learning_rate": 3.98895268401578e-06, "loss": 0.87684488, "num_input_tokens_seen": 22256465, "step": 1039, "time_per_iteration": 2.7351109981536865 }, { "auxiliary_loss_clip": 0.01188545, "auxiliary_loss_mlp": 0.01070323, "balance_loss_clip": 1.05834138, "balance_loss_mlp": 1.04472923, "epoch": 0.0625281827746881, "flos": 19311896833920.0, "grad_norm": 2.217895985816133, "language_loss": 0.81172895, "learning_rate": 3.9889117680296e-06, "loss": 0.83431756, "num_input_tokens_seen": 22274025, "step": 1040, "time_per_iteration": 2.6532907485961914 }, { "auxiliary_loss_clip": 0.0121654, "auxiliary_loss_mlp": 0.0106312, "balance_loss_clip": 1.06718016, "balance_loss_mlp": 1.03808582, "epoch": 0.06258830602735609, "flos": 27745769289600.0, "grad_norm": 2.1960038080149817, "language_loss": 0.69304991, "learning_rate": 3.988870776623685e-06, "loss": 0.71584648, "num_input_tokens_seen": 22292245, "step": 1041, "time_per_iteration": 2.6445486545562744 }, { "auxiliary_loss_clip": 0.01214659, "auxiliary_loss_mlp": 0.01057975, "balance_loss_clip": 1.06247008, "balance_loss_mlp": 1.03182077, "epoch": 0.06264842928002405, "flos": 23222605150080.0, "grad_norm": 2.7326158002445, "language_loss": 0.81187552, "learning_rate": 3.9888297097995905e-06, "loss": 0.83460188, "num_input_tokens_seen": 22311455, "step": 1042, "time_per_iteration": 2.6111559867858887 }, { "auxiliary_loss_clip": 0.01211653, "auxiliary_loss_mlp": 0.01052676, "balance_loss_clip": 1.06253886, "balance_loss_mlp": 1.02871442, "epoch": 0.06270855253269202, "flos": 38399495189760.0, "grad_norm": 1.7165873820424848, "language_loss": 0.76349056, "learning_rate": 3.988788567558874e-06, "loss": 0.78613389, "num_input_tokens_seen": 22333750, "step": 1043, "time_per_iteration": 2.761768341064453 }, { "auxiliary_loss_clip": 0.0118944, "auxiliary_loss_mlp": 0.01063189, "balance_loss_clip": 1.06111181, "balance_loss_mlp": 1.03912091, "epoch": 0.06276867578535998, "flos": 22453542028800.0, "grad_norm": 8.34017761542712, "language_loss": 0.92031956, "learning_rate": 3.988747349903097e-06, "loss": 0.94284582, "num_input_tokens_seen": 22351940, "step": 1044, "time_per_iteration": 2.636179208755493 }, { "auxiliary_loss_clip": 0.01192566, "auxiliary_loss_mlp": 0.01070128, "balance_loss_clip": 1.05862689, "balance_loss_mlp": 1.0456785, "epoch": 0.06282879903802796, "flos": 22930435923840.0, "grad_norm": 2.3486674311430944, "language_loss": 0.85913992, "learning_rate": 3.988706056833821e-06, "loss": 0.88176692, "num_input_tokens_seen": 22372085, "step": 1045, "time_per_iteration": 2.7749502658843994 }, { "auxiliary_loss_clip": 0.01179197, "auxiliary_loss_mlp": 0.01065176, "balance_loss_clip": 1.05804443, "balance_loss_mlp": 1.04053521, "epoch": 0.06288892229069593, "flos": 34819237019520.0, "grad_norm": 1.9846122850853416, "language_loss": 0.7796576, "learning_rate": 3.9886646883526125e-06, "loss": 0.80210131, "num_input_tokens_seen": 22392020, "step": 1046, "time_per_iteration": 2.803135871887207 }, { "auxiliary_loss_clip": 0.01197344, "auxiliary_loss_mlp": 0.01069269, "balance_loss_clip": 1.06361508, "balance_loss_mlp": 1.04558206, "epoch": 0.06294904554336389, "flos": 19427134642560.0, "grad_norm": 2.174325060947129, "language_loss": 0.77326387, "learning_rate": 3.988623244461039e-06, "loss": 0.79592997, "num_input_tokens_seen": 22411180, "step": 1047, "time_per_iteration": 2.647446632385254 }, { "auxiliary_loss_clip": 0.01200907, "auxiliary_loss_mlp": 0.0105799, "balance_loss_clip": 1.06238222, "balance_loss_mlp": 1.03314662, "epoch": 0.06300916879603187, "flos": 40661867358720.0, "grad_norm": 2.4899372640825046, "language_loss": 0.77190751, "learning_rate": 3.988581725160672e-06, "loss": 0.79449654, "num_input_tokens_seen": 22435105, "step": 1048, "time_per_iteration": 2.8167293071746826 }, { "auxiliary_loss_clip": 0.0118184, "auxiliary_loss_mlp": 0.01064361, "balance_loss_clip": 1.0613215, "balance_loss_mlp": 1.03914821, "epoch": 0.06306929204869983, "flos": 23804142341760.0, "grad_norm": 4.606540291271834, "language_loss": 0.77258086, "learning_rate": 3.988540130453087e-06, "loss": 0.79504287, "num_input_tokens_seen": 22452710, "step": 1049, "time_per_iteration": 2.6908538341522217 }, { "auxiliary_loss_clip": 0.01194538, "auxiliary_loss_mlp": 0.0105682, "balance_loss_clip": 1.06043661, "balance_loss_mlp": 1.03290701, "epoch": 0.0631294153013678, "flos": 18915802583040.0, "grad_norm": 2.515998307474139, "language_loss": 0.83302009, "learning_rate": 3.988498460339862e-06, "loss": 0.85553372, "num_input_tokens_seen": 22470175, "step": 1050, "time_per_iteration": 2.62186861038208 }, { "auxiliary_loss_clip": 0.01210654, "auxiliary_loss_mlp": 0.01062894, "balance_loss_clip": 1.06468701, "balance_loss_mlp": 1.04008913, "epoch": 0.06318953855403578, "flos": 24280174310400.0, "grad_norm": 5.5478202090132065, "language_loss": 0.76564771, "learning_rate": 3.988456714822575e-06, "loss": 0.78838319, "num_input_tokens_seen": 22490020, "step": 1051, "time_per_iteration": 2.732269525527954 }, { "auxiliary_loss_clip": 0.01188416, "auxiliary_loss_mlp": 0.01069443, "balance_loss_clip": 1.06340146, "balance_loss_mlp": 1.04492211, "epoch": 0.06324966180670374, "flos": 22528918719360.0, "grad_norm": 1.9993900469270787, "language_loss": 0.80410004, "learning_rate": 3.98841489390281e-06, "loss": 0.82667863, "num_input_tokens_seen": 22509685, "step": 1052, "time_per_iteration": 2.7683873176574707 }, { "auxiliary_loss_clip": 0.01211333, "auxiliary_loss_mlp": 0.01058255, "balance_loss_clip": 1.06324601, "balance_loss_mlp": 1.03468728, "epoch": 0.06330978505937171, "flos": 15778107884160.0, "grad_norm": 2.370007457349547, "language_loss": 0.77433288, "learning_rate": 3.988372997582155e-06, "loss": 0.79702866, "num_input_tokens_seen": 22527905, "step": 1053, "time_per_iteration": 5.757168531417847 }, { "auxiliary_loss_clip": 0.01190721, "auxiliary_loss_mlp": 0.00780448, "balance_loss_clip": 1.06378174, "balance_loss_mlp": 1.00028598, "epoch": 0.06336990831203967, "flos": 21471098163840.0, "grad_norm": 3.085258828985267, "language_loss": 0.84931248, "learning_rate": 3.988331025862195e-06, "loss": 0.86902416, "num_input_tokens_seen": 22546335, "step": 1054, "time_per_iteration": 2.7733829021453857 }, { "auxiliary_loss_clip": 0.01172281, "auxiliary_loss_mlp": 0.01061232, "balance_loss_clip": 1.05722666, "balance_loss_mlp": 1.03753328, "epoch": 0.06343003156470765, "flos": 18478877546880.0, "grad_norm": 2.0168531459993435, "language_loss": 0.85884213, "learning_rate": 3.9882889787445225e-06, "loss": 0.88117731, "num_input_tokens_seen": 22563885, "step": 1055, "time_per_iteration": 4.490305185317993 }, { "auxiliary_loss_clip": 0.01164237, "auxiliary_loss_mlp": 0.01069785, "balance_loss_clip": 1.05727792, "balance_loss_mlp": 1.04534709, "epoch": 0.06349015481737562, "flos": 25154886309120.0, "grad_norm": 2.4509218988768, "language_loss": 0.8113938, "learning_rate": 3.988246856230734e-06, "loss": 0.83373404, "num_input_tokens_seen": 22583035, "step": 1056, "time_per_iteration": 5.345282793045044 }, { "auxiliary_loss_clip": 0.01144181, "auxiliary_loss_mlp": 0.01061125, "balance_loss_clip": 1.04991364, "balance_loss_mlp": 1.03449368, "epoch": 0.06355027807004358, "flos": 26871775562880.0, "grad_norm": 2.2117272688527128, "language_loss": 0.81083393, "learning_rate": 3.988204658322426e-06, "loss": 0.83288693, "num_input_tokens_seen": 22605055, "step": 1057, "time_per_iteration": 2.866757392883301 }, { "auxiliary_loss_clip": 0.01139076, "auxiliary_loss_mlp": 0.01061742, "balance_loss_clip": 1.04970908, "balance_loss_mlp": 1.03918755, "epoch": 0.06361040132271156, "flos": 21396691140480.0, "grad_norm": 1.9636971972870172, "language_loss": 0.83353591, "learning_rate": 3.988162385021196e-06, "loss": 0.85554409, "num_input_tokens_seen": 22623760, "step": 1058, "time_per_iteration": 2.767024278640747 }, { "auxiliary_loss_clip": 0.0117752, "auxiliary_loss_mlp": 0.01059639, "balance_loss_clip": 1.0576936, "balance_loss_mlp": 1.03408027, "epoch": 0.06367052457537953, "flos": 25733765894400.0, "grad_norm": 2.137077300251244, "language_loss": 0.87556928, "learning_rate": 3.988120036328651e-06, "loss": 0.89794087, "num_input_tokens_seen": 22643000, "step": 1059, "time_per_iteration": 2.794734239578247 }, { "auxiliary_loss_clip": 0.01169658, "auxiliary_loss_mlp": 0.01063463, "balance_loss_clip": 1.06196678, "balance_loss_mlp": 1.0383693, "epoch": 0.0637306478280475, "flos": 17631420992640.0, "grad_norm": 2.543966627588717, "language_loss": 0.91561133, "learning_rate": 3.988077612246394e-06, "loss": 0.93794256, "num_input_tokens_seen": 22660460, "step": 1060, "time_per_iteration": 2.8223626613616943 }, { "auxiliary_loss_clip": 0.01173933, "auxiliary_loss_mlp": 0.01065151, "balance_loss_clip": 1.05715585, "balance_loss_mlp": 1.03981876, "epoch": 0.06379077108071547, "flos": 13662610427520.0, "grad_norm": 1.9401711052692647, "language_loss": 0.87242293, "learning_rate": 3.988035112776035e-06, "loss": 0.89481378, "num_input_tokens_seen": 22679270, "step": 1061, "time_per_iteration": 2.7783865928649902 }, { "auxiliary_loss_clip": 0.01190039, "auxiliary_loss_mlp": 0.01059971, "balance_loss_clip": 1.05976009, "balance_loss_mlp": 1.03388786, "epoch": 0.06385089433338344, "flos": 28478849961600.0, "grad_norm": 5.360593029379932, "language_loss": 0.77407908, "learning_rate": 3.987992537919185e-06, "loss": 0.79657912, "num_input_tokens_seen": 22699330, "step": 1062, "time_per_iteration": 2.872587203979492 }, { "auxiliary_loss_clip": 0.01172912, "auxiliary_loss_mlp": 0.01061175, "balance_loss_clip": 1.05884075, "balance_loss_mlp": 1.03798842, "epoch": 0.0639110175860514, "flos": 24311057028480.0, "grad_norm": 2.2658654128491436, "language_loss": 0.86522883, "learning_rate": 3.987949887677459e-06, "loss": 0.88756967, "num_input_tokens_seen": 22717945, "step": 1063, "time_per_iteration": 2.7915029525756836 }, { "auxiliary_loss_clip": 0.01207773, "auxiliary_loss_mlp": 0.01062698, "balance_loss_clip": 1.05969334, "balance_loss_mlp": 1.03846335, "epoch": 0.06397114083871938, "flos": 22090772620800.0, "grad_norm": 2.302236346678267, "language_loss": 0.79908657, "learning_rate": 3.9879071620524744e-06, "loss": 0.82179129, "num_input_tokens_seen": 22736790, "step": 1064, "time_per_iteration": 2.6880991458892822 }, { "auxiliary_loss_clip": 0.01198826, "auxiliary_loss_mlp": 0.01066465, "balance_loss_clip": 1.0603801, "balance_loss_mlp": 1.04149103, "epoch": 0.06403126409138735, "flos": 19572824206080.0, "grad_norm": 3.1552731138796215, "language_loss": 0.84327948, "learning_rate": 3.987864361045851e-06, "loss": 0.8659324, "num_input_tokens_seen": 22754745, "step": 1065, "time_per_iteration": 2.6956398487091064 }, { "auxiliary_loss_clip": 0.01168098, "auxiliary_loss_mlp": 0.01054905, "balance_loss_clip": 1.0597136, "balance_loss_mlp": 1.03162324, "epoch": 0.06409138734405531, "flos": 40807413267840.0, "grad_norm": 1.52830872536012, "language_loss": 0.68177885, "learning_rate": 3.987821484659211e-06, "loss": 0.70400894, "num_input_tokens_seen": 22776780, "step": 1066, "time_per_iteration": 2.9867773056030273 }, { "auxiliary_loss_clip": 0.01214184, "auxiliary_loss_mlp": 0.01070649, "balance_loss_clip": 1.06780005, "balance_loss_mlp": 1.04609215, "epoch": 0.06415151059672328, "flos": 20441610460800.0, "grad_norm": 1.8546001537284342, "language_loss": 0.90349269, "learning_rate": 3.987778532894181e-06, "loss": 0.926341, "num_input_tokens_seen": 22793915, "step": 1067, "time_per_iteration": 2.685896873474121 }, { "auxiliary_loss_clip": 0.01188134, "auxiliary_loss_mlp": 0.01063022, "balance_loss_clip": 1.0623709, "balance_loss_mlp": 1.03969264, "epoch": 0.06421163384939126, "flos": 18072045129600.0, "grad_norm": 2.189788428445167, "language_loss": 0.83437371, "learning_rate": 3.987735505752391e-06, "loss": 0.85688531, "num_input_tokens_seen": 22812670, "step": 1068, "time_per_iteration": 2.851602554321289 }, { "auxiliary_loss_clip": 0.01178972, "auxiliary_loss_mlp": 0.01057745, "balance_loss_clip": 1.05909026, "balance_loss_mlp": 1.03426039, "epoch": 0.06427175710205922, "flos": 25119442563840.0, "grad_norm": 3.045176948020938, "language_loss": 0.89311272, "learning_rate": 3.987692403235471e-06, "loss": 0.9154799, "num_input_tokens_seen": 22832440, "step": 1069, "time_per_iteration": 2.7825255393981934 }, { "auxiliary_loss_clip": 0.01185672, "auxiliary_loss_mlp": 0.01071834, "balance_loss_clip": 1.06158304, "balance_loss_mlp": 1.04663301, "epoch": 0.06433188035472719, "flos": 17380549428480.0, "grad_norm": 2.7038488706194808, "language_loss": 0.95759481, "learning_rate": 3.987649225345056e-06, "loss": 0.98016989, "num_input_tokens_seen": 22845495, "step": 1070, "time_per_iteration": 2.715296506881714 }, { "auxiliary_loss_clip": 0.01140792, "auxiliary_loss_mlp": 0.01056718, "balance_loss_clip": 1.05607581, "balance_loss_mlp": 1.03027749, "epoch": 0.06439200360739517, "flos": 23546267625600.0, "grad_norm": 1.630790580283393, "language_loss": 0.8811003, "learning_rate": 3.987605972082782e-06, "loss": 0.90307534, "num_input_tokens_seen": 22865390, "step": 1071, "time_per_iteration": 2.8445394039154053 }, { "auxiliary_loss_clip": 0.01155172, "auxiliary_loss_mlp": 0.01054986, "balance_loss_clip": 1.05483651, "balance_loss_mlp": 1.03102481, "epoch": 0.06445212686006313, "flos": 21979772616960.0, "grad_norm": 1.8349443396730127, "language_loss": 0.76116478, "learning_rate": 3.987562643450292e-06, "loss": 0.78326637, "num_input_tokens_seen": 22885495, "step": 1072, "time_per_iteration": 2.8330819606781006 }, { "auxiliary_loss_clip": 0.01172997, "auxiliary_loss_mlp": 0.01070104, "balance_loss_clip": 1.05975842, "balance_loss_mlp": 1.04362798, "epoch": 0.0645122501127311, "flos": 25921291824000.0, "grad_norm": 2.724283900767911, "language_loss": 0.80849886, "learning_rate": 3.987519239449226e-06, "loss": 0.83092993, "num_input_tokens_seen": 22904845, "step": 1073, "time_per_iteration": 2.748286247253418 }, { "auxiliary_loss_clip": 0.01194712, "auxiliary_loss_mlp": 0.01062452, "balance_loss_clip": 1.06345201, "balance_loss_mlp": 1.03825283, "epoch": 0.06457237336539907, "flos": 25626034028160.0, "grad_norm": 5.0538746884234245, "language_loss": 0.80282539, "learning_rate": 3.987475760081233e-06, "loss": 0.82539707, "num_input_tokens_seen": 22925940, "step": 1074, "time_per_iteration": 2.7482337951660156 }, { "auxiliary_loss_clip": 0.01173366, "auxiliary_loss_mlp": 0.01057774, "balance_loss_clip": 1.05920076, "balance_loss_mlp": 1.03256142, "epoch": 0.06463249661806704, "flos": 19463979018240.0, "grad_norm": 2.0209756517373707, "language_loss": 0.79249811, "learning_rate": 3.987432205347958e-06, "loss": 0.8148095, "num_input_tokens_seen": 22944375, "step": 1075, "time_per_iteration": 2.6937224864959717 }, { "auxiliary_loss_clip": 0.01171297, "auxiliary_loss_mlp": 0.01063569, "balance_loss_clip": 1.05735481, "balance_loss_mlp": 1.04025126, "epoch": 0.064692619870735, "flos": 24498044254080.0, "grad_norm": 2.9028991302223357, "language_loss": 0.88208115, "learning_rate": 3.987388575251055e-06, "loss": 0.90442967, "num_input_tokens_seen": 22959145, "step": 1076, "time_per_iteration": 2.878103256225586 }, { "auxiliary_loss_clip": 0.01192915, "auxiliary_loss_mlp": 0.01052877, "balance_loss_clip": 1.06164443, "balance_loss_mlp": 1.0288558, "epoch": 0.06475274312340297, "flos": 17018677860480.0, "grad_norm": 2.225760792581628, "language_loss": 0.80876106, "learning_rate": 3.98734486979218e-06, "loss": 0.83121902, "num_input_tokens_seen": 22978100, "step": 1077, "time_per_iteration": 2.7221076488494873 }, { "auxiliary_loss_clip": 0.01200466, "auxiliary_loss_mlp": 0.01064019, "balance_loss_clip": 1.0656153, "balance_loss_mlp": 1.03866291, "epoch": 0.06481286637607095, "flos": 24572379450240.0, "grad_norm": 2.256787147683815, "language_loss": 0.91727465, "learning_rate": 3.987301088972986e-06, "loss": 0.93991947, "num_input_tokens_seen": 22997285, "step": 1078, "time_per_iteration": 2.862365484237671 }, { "auxiliary_loss_clip": 0.0122435, "auxiliary_loss_mlp": 0.01060225, "balance_loss_clip": 1.06826639, "balance_loss_mlp": 1.03552508, "epoch": 0.06487298962873891, "flos": 21105635235840.0, "grad_norm": 2.080056711608912, "language_loss": 0.78349572, "learning_rate": 3.987257232795137e-06, "loss": 0.80634147, "num_input_tokens_seen": 23016285, "step": 1079, "time_per_iteration": 2.6435368061065674 }, { "auxiliary_loss_clip": 0.01156927, "auxiliary_loss_mlp": 0.01063794, "balance_loss_clip": 1.05512071, "balance_loss_mlp": 1.03899896, "epoch": 0.06493311288140688, "flos": 24608182331520.0, "grad_norm": 2.274862403364013, "language_loss": 0.68702769, "learning_rate": 3.987213301260294e-06, "loss": 0.70923495, "num_input_tokens_seen": 23036420, "step": 1080, "time_per_iteration": 2.7782626152038574 }, { "auxiliary_loss_clip": 0.01175684, "auxiliary_loss_mlp": 0.01062351, "balance_loss_clip": 1.06640029, "balance_loss_mlp": 1.03610086, "epoch": 0.06499323613407486, "flos": 25337994865920.0, "grad_norm": 1.886196453243775, "language_loss": 0.72291583, "learning_rate": 3.987169294370123e-06, "loss": 0.74529618, "num_input_tokens_seen": 23056945, "step": 1081, "time_per_iteration": 2.7983880043029785 }, { "auxiliary_loss_clip": 0.01139671, "auxiliary_loss_mlp": 0.01066686, "balance_loss_clip": 1.0504055, "balance_loss_mlp": 1.04076982, "epoch": 0.06505335938674282, "flos": 20375714960640.0, "grad_norm": 3.3093934650613566, "language_loss": 0.84059012, "learning_rate": 3.987125212126294e-06, "loss": 0.86265367, "num_input_tokens_seen": 23074940, "step": 1082, "time_per_iteration": 2.8351900577545166 }, { "auxiliary_loss_clip": 0.01204185, "auxiliary_loss_mlp": 0.01063692, "balance_loss_clip": 1.06306195, "balance_loss_mlp": 1.03809738, "epoch": 0.06511348263941079, "flos": 25337923038720.0, "grad_norm": 2.894360492506304, "language_loss": 0.82550305, "learning_rate": 3.987081054530478e-06, "loss": 0.84818184, "num_input_tokens_seen": 23093420, "step": 1083, "time_per_iteration": 2.866729974746704 }, { "auxiliary_loss_clip": 0.01168245, "auxiliary_loss_mlp": 0.01062938, "balance_loss_clip": 1.06021011, "balance_loss_mlp": 1.03655696, "epoch": 0.06517360589207877, "flos": 20332801186560.0, "grad_norm": 2.468736383036802, "language_loss": 0.79289383, "learning_rate": 3.987036821584348e-06, "loss": 0.81520569, "num_input_tokens_seen": 23111550, "step": 1084, "time_per_iteration": 2.816601276397705 }, { "auxiliary_loss_clip": 0.01174068, "auxiliary_loss_mlp": 0.0106167, "balance_loss_clip": 1.05854714, "balance_loss_mlp": 1.03667152, "epoch": 0.06523372914474673, "flos": 31681650061440.0, "grad_norm": 2.571590277205686, "language_loss": 0.66443276, "learning_rate": 3.986992513289584e-06, "loss": 0.68679011, "num_input_tokens_seen": 23130335, "step": 1085, "time_per_iteration": 2.8260092735290527 }, { "auxiliary_loss_clip": 0.01170818, "auxiliary_loss_mlp": 0.01062435, "balance_loss_clip": 1.0600934, "balance_loss_mlp": 1.03833067, "epoch": 0.0652938523974147, "flos": 20778165918720.0, "grad_norm": 2.0478791529086977, "language_loss": 0.76548934, "learning_rate": 3.9869481296478645e-06, "loss": 0.78782183, "num_input_tokens_seen": 23152380, "step": 1086, "time_per_iteration": 2.7937023639678955 }, { "auxiliary_loss_clip": 0.01198609, "auxiliary_loss_mlp": 0.01059288, "balance_loss_clip": 1.06335294, "balance_loss_mlp": 1.03519547, "epoch": 0.06535397565008266, "flos": 16690993061760.0, "grad_norm": 2.1629448601391017, "language_loss": 0.85109925, "learning_rate": 3.986903670660872e-06, "loss": 0.87367821, "num_input_tokens_seen": 23171630, "step": 1087, "time_per_iteration": 2.7510013580322266 }, { "auxiliary_loss_clip": 0.01184978, "auxiliary_loss_mlp": 0.01059017, "balance_loss_clip": 1.06293821, "balance_loss_mlp": 1.03510392, "epoch": 0.06541409890275064, "flos": 26868220116480.0, "grad_norm": 1.7886353193129139, "language_loss": 0.77776635, "learning_rate": 3.9868591363302945e-06, "loss": 0.80020636, "num_input_tokens_seen": 23192520, "step": 1088, "time_per_iteration": 2.7792751789093018 }, { "auxiliary_loss_clip": 0.01192707, "auxiliary_loss_mlp": 0.01067634, "balance_loss_clip": 1.06569457, "balance_loss_mlp": 1.04498422, "epoch": 0.06547422215541861, "flos": 20521620005760.0, "grad_norm": 3.0334087154373375, "language_loss": 0.71050513, "learning_rate": 3.9868145266578186e-06, "loss": 0.73310852, "num_input_tokens_seen": 23210710, "step": 1089, "time_per_iteration": 2.8832852840423584 }, { "auxiliary_loss_clip": 0.01173663, "auxiliary_loss_mlp": 0.00781529, "balance_loss_clip": 1.06159782, "balance_loss_mlp": 1.00019014, "epoch": 0.06553434540808657, "flos": 22016616992640.0, "grad_norm": 2.02973275746688, "language_loss": 0.85650897, "learning_rate": 3.9867698416451366e-06, "loss": 0.87606084, "num_input_tokens_seen": 23230305, "step": 1090, "time_per_iteration": 2.7933149337768555 }, { "auxiliary_loss_clip": 0.01214666, "auxiliary_loss_mlp": 0.0105885, "balance_loss_clip": 1.06735325, "balance_loss_mlp": 1.03460288, "epoch": 0.06559446866075455, "flos": 24608649208320.0, "grad_norm": 2.137212216289862, "language_loss": 0.71829313, "learning_rate": 3.9867250812939434e-06, "loss": 0.74102825, "num_input_tokens_seen": 23249015, "step": 1091, "time_per_iteration": 2.646592855453491 }, { "auxiliary_loss_clip": 0.01121055, "auxiliary_loss_mlp": 0.0106405, "balance_loss_clip": 1.05242276, "balance_loss_mlp": 1.03961205, "epoch": 0.06565459191342252, "flos": 24274679529600.0, "grad_norm": 2.2773849385721956, "language_loss": 0.82839823, "learning_rate": 3.986680245605936e-06, "loss": 0.85024923, "num_input_tokens_seen": 23265105, "step": 1092, "time_per_iteration": 4.799649715423584 }, { "auxiliary_loss_clip": 0.01215092, "auxiliary_loss_mlp": 0.01059151, "balance_loss_clip": 1.0640471, "balance_loss_mlp": 1.03352082, "epoch": 0.06571471516609048, "flos": 24787124910720.0, "grad_norm": 2.268968080418226, "language_loss": 0.71134168, "learning_rate": 3.986635334582814e-06, "loss": 0.73408413, "num_input_tokens_seen": 23283950, "step": 1093, "time_per_iteration": 5.3356239795684814 }, { "auxiliary_loss_clip": 0.01190682, "auxiliary_loss_mlp": 0.01064498, "balance_loss_clip": 1.06751943, "balance_loss_mlp": 1.0392611, "epoch": 0.06577483841875846, "flos": 26214071581440.0, "grad_norm": 3.829837904337144, "language_loss": 0.87996346, "learning_rate": 3.986590348226282e-06, "loss": 0.90251523, "num_input_tokens_seen": 23305005, "step": 1094, "time_per_iteration": 2.853489637374878 }, { "auxiliary_loss_clip": 0.01192742, "auxiliary_loss_mlp": 0.01065068, "balance_loss_clip": 1.06367433, "balance_loss_mlp": 1.03843689, "epoch": 0.06583496167142643, "flos": 25080802508160.0, "grad_norm": 1.6736216436017588, "language_loss": 0.81483954, "learning_rate": 3.986545286538044e-06, "loss": 0.8374176, "num_input_tokens_seen": 23323220, "step": 1095, "time_per_iteration": 5.1613922119140625 }, { "auxiliary_loss_clip": 0.01166049, "auxiliary_loss_mlp": 0.01058945, "balance_loss_clip": 1.06295943, "balance_loss_mlp": 1.03598547, "epoch": 0.06589508492409439, "flos": 25629804956160.0, "grad_norm": 2.0200125290673068, "language_loss": 0.69789279, "learning_rate": 3.986500149519811e-06, "loss": 0.72014272, "num_input_tokens_seen": 23342235, "step": 1096, "time_per_iteration": 2.804025173187256 }, { "auxiliary_loss_clip": 0.01201939, "auxiliary_loss_mlp": 0.01070786, "balance_loss_clip": 1.06405246, "balance_loss_mlp": 1.04614568, "epoch": 0.06595520817676236, "flos": 23621249266560.0, "grad_norm": 1.7011375462517908, "language_loss": 0.77430046, "learning_rate": 3.986454937173292e-06, "loss": 0.79702777, "num_input_tokens_seen": 23363680, "step": 1097, "time_per_iteration": 2.7658958435058594 }, { "auxiliary_loss_clip": 0.01215996, "auxiliary_loss_mlp": 0.01063445, "balance_loss_clip": 1.06707537, "balance_loss_mlp": 1.03959155, "epoch": 0.06601533142943034, "flos": 33801708545280.0, "grad_norm": 1.8316558452843608, "language_loss": 0.78217584, "learning_rate": 3.986409649500203e-06, "loss": 0.80497026, "num_input_tokens_seen": 23385590, "step": 1098, "time_per_iteration": 2.865684747695923 }, { "auxiliary_loss_clip": 0.01197349, "auxiliary_loss_mlp": 0.01069192, "balance_loss_clip": 1.06328607, "balance_loss_mlp": 1.04443276, "epoch": 0.0660754546820983, "flos": 20259184262400.0, "grad_norm": 1.9237510259783663, "language_loss": 0.81525648, "learning_rate": 3.986364286502261e-06, "loss": 0.83792192, "num_input_tokens_seen": 23402945, "step": 1099, "time_per_iteration": 2.690377950668335 }, { "auxiliary_loss_clip": 0.01179995, "auxiliary_loss_mlp": 0.0105819, "balance_loss_clip": 1.0578239, "balance_loss_mlp": 1.03428841, "epoch": 0.06613557793476627, "flos": 19354164163200.0, "grad_norm": 1.9906927310803755, "language_loss": 0.82793295, "learning_rate": 3.986318848181186e-06, "loss": 0.8503148, "num_input_tokens_seen": 23421410, "step": 1100, "time_per_iteration": 2.7613909244537354 }, { "auxiliary_loss_clip": 0.01191263, "auxiliary_loss_mlp": 0.0105903, "balance_loss_clip": 1.06985724, "balance_loss_mlp": 1.03529549, "epoch": 0.06619570118743424, "flos": 13772568936960.0, "grad_norm": 2.079994286400427, "language_loss": 0.73502243, "learning_rate": 3.986273334538702e-06, "loss": 0.75752538, "num_input_tokens_seen": 23438870, "step": 1101, "time_per_iteration": 2.7795870304107666 }, { "auxiliary_loss_clip": 0.01199256, "auxiliary_loss_mlp": 0.01061171, "balance_loss_clip": 1.06278944, "balance_loss_mlp": 1.03773487, "epoch": 0.06625582444010221, "flos": 17857874286720.0, "grad_norm": 2.875757629612747, "language_loss": 0.85861301, "learning_rate": 3.986227745576533e-06, "loss": 0.88121736, "num_input_tokens_seen": 23456975, "step": 1102, "time_per_iteration": 2.737269401550293 }, { "auxiliary_loss_clip": 0.01191982, "auxiliary_loss_mlp": 0.01058639, "balance_loss_clip": 1.06898165, "balance_loss_mlp": 1.03410578, "epoch": 0.06631594769277017, "flos": 11838707579520.0, "grad_norm": 2.8924251757501778, "language_loss": 0.81655926, "learning_rate": 3.98618208129641e-06, "loss": 0.83906543, "num_input_tokens_seen": 23473440, "step": 1103, "time_per_iteration": 2.9345293045043945 }, { "auxiliary_loss_clip": 0.01203522, "auxiliary_loss_mlp": 0.00780451, "balance_loss_clip": 1.06721628, "balance_loss_mlp": 1.00042021, "epoch": 0.06637607094543815, "flos": 19793351756160.0, "grad_norm": 5.176370819061919, "language_loss": 0.81749105, "learning_rate": 3.986136341700063e-06, "loss": 0.83733076, "num_input_tokens_seen": 23493880, "step": 1104, "time_per_iteration": 2.753657102584839 }, { "auxiliary_loss_clip": 0.0116508, "auxiliary_loss_mlp": 0.01050687, "balance_loss_clip": 1.0576005, "balance_loss_mlp": 1.02608228, "epoch": 0.06643619419810612, "flos": 25485659677440.0, "grad_norm": 1.5448539486038575, "language_loss": 0.80422902, "learning_rate": 3.986090526789227e-06, "loss": 0.82638663, "num_input_tokens_seen": 23514920, "step": 1105, "time_per_iteration": 2.8904521465301514 }, { "auxiliary_loss_clip": 0.01179397, "auxiliary_loss_mlp": 0.0106197, "balance_loss_clip": 1.06348729, "balance_loss_mlp": 1.0391891, "epoch": 0.06649631745077408, "flos": 16946533393920.0, "grad_norm": 2.7426455725749896, "language_loss": 0.96762037, "learning_rate": 3.986044636565639e-06, "loss": 0.99003398, "num_input_tokens_seen": 23531635, "step": 1106, "time_per_iteration": 2.890073299407959 }, { "auxiliary_loss_clip": 0.01198065, "auxiliary_loss_mlp": 0.01059975, "balance_loss_clip": 1.06069684, "balance_loss_mlp": 1.03511953, "epoch": 0.06655644070344206, "flos": 17858592558720.0, "grad_norm": 1.9297768479693453, "language_loss": 0.82528949, "learning_rate": 3.985998671031039e-06, "loss": 0.84786987, "num_input_tokens_seen": 23551020, "step": 1107, "time_per_iteration": 2.778857469558716 }, { "auxiliary_loss_clip": 0.01104176, "auxiliary_loss_mlp": 0.01010935, "balance_loss_clip": 1.04708242, "balance_loss_mlp": 1.0072155, "epoch": 0.06661656395611003, "flos": 61419350021760.0, "grad_norm": 0.7967940032222198, "language_loss": 0.56789279, "learning_rate": 3.9859526301871705e-06, "loss": 0.58904392, "num_input_tokens_seen": 23610675, "step": 1108, "time_per_iteration": 3.2717819213867188 }, { "auxiliary_loss_clip": 0.0118327, "auxiliary_loss_mlp": 0.01062625, "balance_loss_clip": 1.05651307, "balance_loss_mlp": 1.0376507, "epoch": 0.066676687208778, "flos": 20662856282880.0, "grad_norm": 2.682842555407744, "language_loss": 0.7287578, "learning_rate": 3.9859065140357795e-06, "loss": 0.75121677, "num_input_tokens_seen": 23628710, "step": 1109, "time_per_iteration": 2.829623222351074 }, { "auxiliary_loss_clip": 0.01148971, "auxiliary_loss_mlp": 0.01071895, "balance_loss_clip": 1.05459642, "balance_loss_mlp": 1.04714715, "epoch": 0.06673681046144596, "flos": 20923280864640.0, "grad_norm": 1.7914435942805436, "language_loss": 0.78140426, "learning_rate": 3.985860322578614e-06, "loss": 0.80361295, "num_input_tokens_seen": 23649160, "step": 1110, "time_per_iteration": 2.892786741256714 }, { "auxiliary_loss_clip": 0.01153553, "auxiliary_loss_mlp": 0.0106147, "balance_loss_clip": 1.05590594, "balance_loss_mlp": 1.03700781, "epoch": 0.06679693371411394, "flos": 31065818359680.0, "grad_norm": 2.5260725451831805, "language_loss": 0.71425366, "learning_rate": 3.985814055817427e-06, "loss": 0.73640382, "num_input_tokens_seen": 23671995, "step": 1111, "time_per_iteration": 2.9349052906036377 }, { "auxiliary_loss_clip": 0.01170538, "auxiliary_loss_mlp": 0.01066103, "balance_loss_clip": 1.05776191, "balance_loss_mlp": 1.04199934, "epoch": 0.0668570569667819, "flos": 21726135705600.0, "grad_norm": 1.8396663794990693, "language_loss": 0.78767776, "learning_rate": 3.985767713753971e-06, "loss": 0.81004417, "num_input_tokens_seen": 23690705, "step": 1112, "time_per_iteration": 2.8676345348358154 }, { "auxiliary_loss_clip": 0.01153291, "auxiliary_loss_mlp": 0.01065421, "balance_loss_clip": 1.05340791, "balance_loss_mlp": 1.04163861, "epoch": 0.06691718021944987, "flos": 22747255539840.0, "grad_norm": 2.071048188460824, "language_loss": 0.78481978, "learning_rate": 3.985721296390005e-06, "loss": 0.80700684, "num_input_tokens_seen": 23709990, "step": 1113, "time_per_iteration": 2.8688411712646484 }, { "auxiliary_loss_clip": 0.0114872, "auxiliary_loss_mlp": 0.01057074, "balance_loss_clip": 1.05157375, "balance_loss_mlp": 1.03376842, "epoch": 0.06697730347211785, "flos": 16545626720640.0, "grad_norm": 1.7560007918285245, "language_loss": 0.82399213, "learning_rate": 3.985674803727289e-06, "loss": 0.84605002, "num_input_tokens_seen": 23728485, "step": 1114, "time_per_iteration": 2.832458019256592 }, { "auxiliary_loss_clip": 0.01075626, "auxiliary_loss_mlp": 0.01006906, "balance_loss_clip": 1.04995251, "balance_loss_mlp": 1.00271022, "epoch": 0.06703742672478581, "flos": 59782326658560.0, "grad_norm": 0.8370646888074905, "language_loss": 0.58147323, "learning_rate": 3.985628235767584e-06, "loss": 0.60229862, "num_input_tokens_seen": 23786650, "step": 1115, "time_per_iteration": 3.550837755203247 }, { "auxiliary_loss_clip": 0.01177193, "auxiliary_loss_mlp": 0.01059174, "balance_loss_clip": 1.05986214, "balance_loss_mlp": 1.03381801, "epoch": 0.06709754997745378, "flos": 16800197385600.0, "grad_norm": 2.8944873563712235, "language_loss": 0.91280693, "learning_rate": 3.985581592512658e-06, "loss": 0.93517065, "num_input_tokens_seen": 23802555, "step": 1116, "time_per_iteration": 2.994608163833618 }, { "auxiliary_loss_clip": 0.01169376, "auxiliary_loss_mlp": 0.0078227, "balance_loss_clip": 1.05839634, "balance_loss_mlp": 1.00045347, "epoch": 0.06715767323012176, "flos": 22123917895680.0, "grad_norm": 1.9249158333763592, "language_loss": 0.87154609, "learning_rate": 3.985534873964279e-06, "loss": 0.89106256, "num_input_tokens_seen": 23822945, "step": 1117, "time_per_iteration": 2.794400453567505 }, { "auxiliary_loss_clip": 0.01095782, "auxiliary_loss_mlp": 0.01003785, "balance_loss_clip": 1.0387876, "balance_loss_mlp": 0.99963647, "epoch": 0.06721779648278972, "flos": 66618100137600.0, "grad_norm": 0.8644388721740246, "language_loss": 0.5981611, "learning_rate": 3.985488080124218e-06, "loss": 0.61915678, "num_input_tokens_seen": 23874075, "step": 1118, "time_per_iteration": 3.1695809364318848 }, { "auxiliary_loss_clip": 0.01178972, "auxiliary_loss_mlp": 0.01051993, "balance_loss_clip": 1.05301392, "balance_loss_mlp": 1.02780545, "epoch": 0.06727791973545769, "flos": 22382474970240.0, "grad_norm": 3.6923711141076447, "language_loss": 0.83045954, "learning_rate": 3.985441210994251e-06, "loss": 0.85276914, "num_input_tokens_seen": 23889720, "step": 1119, "time_per_iteration": 2.7538814544677734 }, { "auxiliary_loss_clip": 0.01182384, "auxiliary_loss_mlp": 0.01058422, "balance_loss_clip": 1.06102347, "balance_loss_mlp": 1.03566504, "epoch": 0.06733804298812565, "flos": 24280210224000.0, "grad_norm": 4.541743494234462, "language_loss": 0.8451674, "learning_rate": 3.9853942665761545e-06, "loss": 0.86757541, "num_input_tokens_seen": 23909385, "step": 1120, "time_per_iteration": 2.76581072807312 }, { "auxiliary_loss_clip": 0.0121565, "auxiliary_loss_mlp": 0.01064916, "balance_loss_clip": 1.06757379, "balance_loss_mlp": 1.04028773, "epoch": 0.06739816624079363, "flos": 15918230839680.0, "grad_norm": 2.503866645162978, "language_loss": 0.78722781, "learning_rate": 3.985347246871708e-06, "loss": 0.81003344, "num_input_tokens_seen": 23926830, "step": 1121, "time_per_iteration": 2.651175022125244 }, { "auxiliary_loss_clip": 0.01080914, "auxiliary_loss_mlp": 0.01011889, "balance_loss_clip": 1.03108025, "balance_loss_mlp": 1.00802636, "epoch": 0.0674582894934616, "flos": 71398567353600.0, "grad_norm": 0.7540288133642103, "language_loss": 0.58320796, "learning_rate": 3.985300151882694e-06, "loss": 0.60413599, "num_input_tokens_seen": 23992640, "step": 1122, "time_per_iteration": 3.3794541358947754 }, { "auxiliary_loss_clip": 0.01145486, "auxiliary_loss_mlp": 0.01066136, "balance_loss_clip": 1.05581403, "balance_loss_mlp": 1.04167438, "epoch": 0.06751841274612956, "flos": 25264952559360.0, "grad_norm": 2.3361170394687076, "language_loss": 0.71965349, "learning_rate": 3.985252981610901e-06, "loss": 0.74176967, "num_input_tokens_seen": 24011135, "step": 1123, "time_per_iteration": 2.8049354553222656 }, { "auxiliary_loss_clip": 0.01144994, "auxiliary_loss_mlp": 0.01064196, "balance_loss_clip": 1.05373979, "balance_loss_mlp": 1.03612232, "epoch": 0.06757853599879754, "flos": 23802741711360.0, "grad_norm": 1.7380479869896208, "language_loss": 0.78987843, "learning_rate": 3.985205736058114e-06, "loss": 0.81197035, "num_input_tokens_seen": 24030695, "step": 1124, "time_per_iteration": 2.8595056533813477 }, { "auxiliary_loss_clip": 0.01189686, "auxiliary_loss_mlp": 0.01055169, "balance_loss_clip": 1.05663013, "balance_loss_mlp": 1.03200674, "epoch": 0.0676386592514655, "flos": 21033742164480.0, "grad_norm": 3.1450673626590793, "language_loss": 0.70999855, "learning_rate": 3.985158415226128e-06, "loss": 0.73244709, "num_input_tokens_seen": 24050680, "step": 1125, "time_per_iteration": 2.726163625717163 }, { "auxiliary_loss_clip": 0.01165518, "auxiliary_loss_mlp": 0.01068918, "balance_loss_clip": 1.05826426, "balance_loss_mlp": 1.04290628, "epoch": 0.06769878250413347, "flos": 25556331686400.0, "grad_norm": 3.340323364887528, "language_loss": 0.81440383, "learning_rate": 3.985111019116736e-06, "loss": 0.83674812, "num_input_tokens_seen": 24067205, "step": 1126, "time_per_iteration": 2.7356598377227783 }, { "auxiliary_loss_clip": 0.0107201, "auxiliary_loss_mlp": 0.01004999, "balance_loss_clip": 1.0293622, "balance_loss_mlp": 1.00092208, "epoch": 0.06775890575680145, "flos": 70655251305600.0, "grad_norm": 0.77802311726495, "language_loss": 0.59720373, "learning_rate": 3.985063547731735e-06, "loss": 0.6179738, "num_input_tokens_seen": 24131320, "step": 1127, "time_per_iteration": 3.2627320289611816 }, { "auxiliary_loss_clip": 0.01206438, "auxiliary_loss_mlp": 0.01055509, "balance_loss_clip": 1.06308687, "balance_loss_mlp": 1.03189397, "epoch": 0.06781902900946941, "flos": 24235500769920.0, "grad_norm": 2.2535941175889054, "language_loss": 0.81097019, "learning_rate": 3.985016001072925e-06, "loss": 0.83358967, "num_input_tokens_seen": 24149930, "step": 1128, "time_per_iteration": 2.6652371883392334 }, { "auxiliary_loss_clip": 0.01158345, "auxiliary_loss_mlp": 0.01052658, "balance_loss_clip": 1.05360484, "balance_loss_mlp": 1.02804112, "epoch": 0.06787915226213738, "flos": 22417523665920.0, "grad_norm": 2.24200367657907, "language_loss": 0.75559127, "learning_rate": 3.984968379142109e-06, "loss": 0.77770138, "num_input_tokens_seen": 24169590, "step": 1129, "time_per_iteration": 2.7023732662200928 }, { "auxiliary_loss_clip": 0.01117595, "auxiliary_loss_mlp": 0.01053995, "balance_loss_clip": 1.04627228, "balance_loss_mlp": 1.03006983, "epoch": 0.06793927551480534, "flos": 37706922080640.0, "grad_norm": 1.890559803272908, "language_loss": 0.71710479, "learning_rate": 3.984920681941094e-06, "loss": 0.73882067, "num_input_tokens_seen": 24189965, "step": 1130, "time_per_iteration": 3.0757689476013184 }, { "auxiliary_loss_clip": 0.01158117, "auxiliary_loss_mlp": 0.010592, "balance_loss_clip": 1.05734515, "balance_loss_mlp": 1.03481019, "epoch": 0.06799939876747332, "flos": 20631398947200.0, "grad_norm": 2.24421862356218, "language_loss": 0.80776262, "learning_rate": 3.984872909471688e-06, "loss": 0.82993579, "num_input_tokens_seen": 24208045, "step": 1131, "time_per_iteration": 5.00832724571228 }, { "auxiliary_loss_clip": 0.01195331, "auxiliary_loss_mlp": 0.01070142, "balance_loss_clip": 1.06155944, "balance_loss_mlp": 1.04614532, "epoch": 0.06805952202014129, "flos": 14864755829760.0, "grad_norm": 2.0533244923502463, "language_loss": 0.80371779, "learning_rate": 3.984825061735701e-06, "loss": 0.8263725, "num_input_tokens_seen": 24223805, "step": 1132, "time_per_iteration": 4.487931251525879 }, { "auxiliary_loss_clip": 0.01170581, "auxiliary_loss_mlp": 0.01061867, "balance_loss_clip": 1.05438542, "balance_loss_mlp": 1.03756022, "epoch": 0.06811964527280925, "flos": 48909434947200.0, "grad_norm": 1.7182324226465766, "language_loss": 0.6341064, "learning_rate": 3.9847771387349495e-06, "loss": 0.65643084, "num_input_tokens_seen": 24249475, "step": 1133, "time_per_iteration": 4.48089337348938 }, { "auxiliary_loss_clip": 0.01125599, "auxiliary_loss_mlp": 0.01055984, "balance_loss_clip": 1.04700482, "balance_loss_mlp": 1.02973366, "epoch": 0.06817976852547723, "flos": 15377273038080.0, "grad_norm": 1.9264963116598819, "language_loss": 0.74771935, "learning_rate": 3.9847291404712506e-06, "loss": 0.76953518, "num_input_tokens_seen": 24267980, "step": 1134, "time_per_iteration": 5.287277936935425 }, { "auxiliary_loss_clip": 0.01169269, "auxiliary_loss_mlp": 0.00782536, "balance_loss_clip": 1.05878353, "balance_loss_mlp": 1.00042605, "epoch": 0.0682398917781452, "flos": 20155690200960.0, "grad_norm": 2.151108605399924, "language_loss": 0.86871451, "learning_rate": 3.984681066946423e-06, "loss": 0.88823259, "num_input_tokens_seen": 24286805, "step": 1135, "time_per_iteration": 2.8024110794067383 }, { "auxiliary_loss_clip": 0.0117656, "auxiliary_loss_mlp": 0.007818, "balance_loss_clip": 1.0543226, "balance_loss_mlp": 1.00046515, "epoch": 0.06830001503081316, "flos": 23440618748160.0, "grad_norm": 2.521942237810997, "language_loss": 0.78131735, "learning_rate": 3.984632918162291e-06, "loss": 0.80090094, "num_input_tokens_seen": 24305855, "step": 1136, "time_per_iteration": 2.7595040798187256 }, { "auxiliary_loss_clip": 0.01185832, "auxiliary_loss_mlp": 0.01063587, "balance_loss_clip": 1.05952621, "balance_loss_mlp": 1.03868449, "epoch": 0.06836013828348114, "flos": 34349813153280.0, "grad_norm": 2.275643110468061, "language_loss": 0.83968467, "learning_rate": 3.984584694120679e-06, "loss": 0.86217892, "num_input_tokens_seen": 24326535, "step": 1137, "time_per_iteration": 2.7738285064697266 }, { "auxiliary_loss_clip": 0.01153105, "auxiliary_loss_mlp": 0.01059471, "balance_loss_clip": 1.05239427, "balance_loss_mlp": 1.0348897, "epoch": 0.06842026153614911, "flos": 23148844571520.0, "grad_norm": 2.068206081593879, "language_loss": 0.788486, "learning_rate": 3.984536394823418e-06, "loss": 0.81061178, "num_input_tokens_seen": 24345810, "step": 1138, "time_per_iteration": 2.804537296295166 }, { "auxiliary_loss_clip": 0.01209658, "auxiliary_loss_mlp": 0.01058353, "balance_loss_clip": 1.06288362, "balance_loss_mlp": 1.03415346, "epoch": 0.06848038478881707, "flos": 24608972430720.0, "grad_norm": 2.3335265924104096, "language_loss": 0.85507643, "learning_rate": 3.984488020272336e-06, "loss": 0.87775654, "num_input_tokens_seen": 24366095, "step": 1139, "time_per_iteration": 2.746884822845459 }, { "auxiliary_loss_clip": 0.01153855, "auxiliary_loss_mlp": 0.01063721, "balance_loss_clip": 1.05325532, "balance_loss_mlp": 1.03679228, "epoch": 0.06854050804148504, "flos": 40880994278400.0, "grad_norm": 1.9254794009430078, "language_loss": 0.74899161, "learning_rate": 3.984439570469271e-06, "loss": 0.7711674, "num_input_tokens_seen": 24388665, "step": 1140, "time_per_iteration": 2.938143253326416 }, { "auxiliary_loss_clip": 0.01186218, "auxiliary_loss_mlp": 0.00782227, "balance_loss_clip": 1.06101704, "balance_loss_mlp": 1.00036597, "epoch": 0.06860063129415302, "flos": 31686354743040.0, "grad_norm": 2.1250887020504767, "language_loss": 0.68258876, "learning_rate": 3.9843910454160574e-06, "loss": 0.70227319, "num_input_tokens_seen": 24407705, "step": 1141, "time_per_iteration": 2.8180530071258545 }, { "auxiliary_loss_clip": 0.01197117, "auxiliary_loss_mlp": 0.01067748, "balance_loss_clip": 1.05978489, "balance_loss_mlp": 1.04266596, "epoch": 0.06866075454682098, "flos": 26542007775360.0, "grad_norm": 1.8460768582410394, "language_loss": 0.78959155, "learning_rate": 3.984342445114538e-06, "loss": 0.81224018, "num_input_tokens_seen": 24428390, "step": 1142, "time_per_iteration": 2.712876558303833 }, { "auxiliary_loss_clip": 0.01186915, "auxiliary_loss_mlp": 0.01060882, "balance_loss_clip": 1.06245089, "balance_loss_mlp": 1.03702831, "epoch": 0.06872087779948895, "flos": 29789768724480.0, "grad_norm": 1.7867268614306446, "language_loss": 0.68287402, "learning_rate": 3.984293769566553e-06, "loss": 0.70535195, "num_input_tokens_seen": 24450810, "step": 1143, "time_per_iteration": 2.752659320831299 }, { "auxiliary_loss_clip": 0.01177843, "auxiliary_loss_mlp": 0.01059894, "balance_loss_clip": 1.05798244, "balance_loss_mlp": 1.03773308, "epoch": 0.06878100105215693, "flos": 26941118768640.0, "grad_norm": 1.7582250309313294, "language_loss": 0.74307454, "learning_rate": 3.98424501877395e-06, "loss": 0.76545191, "num_input_tokens_seen": 24469965, "step": 1144, "time_per_iteration": 2.6448662281036377 }, { "auxiliary_loss_clip": 0.01189197, "auxiliary_loss_mlp": 0.0106544, "balance_loss_clip": 1.0565474, "balance_loss_mlp": 1.04039407, "epoch": 0.06884112430482489, "flos": 10670748946560.0, "grad_norm": 2.699041414372958, "language_loss": 0.91755033, "learning_rate": 3.984196192738577e-06, "loss": 0.94009674, "num_input_tokens_seen": 24486370, "step": 1145, "time_per_iteration": 2.6621482372283936 }, { "auxiliary_loss_clip": 0.01212189, "auxiliary_loss_mlp": 0.0106819, "balance_loss_clip": 1.06225932, "balance_loss_mlp": 1.04258406, "epoch": 0.06890124755749286, "flos": 20193647898240.0, "grad_norm": 2.2014676012481487, "language_loss": 0.81726635, "learning_rate": 3.984147291462285e-06, "loss": 0.84007025, "num_input_tokens_seen": 24503780, "step": 1146, "time_per_iteration": 2.623964548110962 }, { "auxiliary_loss_clip": 0.01204602, "auxiliary_loss_mlp": 0.01065301, "balance_loss_clip": 1.06215203, "balance_loss_mlp": 1.04191244, "epoch": 0.06896137081016084, "flos": 20449224144000.0, "grad_norm": 2.1265245828428108, "language_loss": 0.84968954, "learning_rate": 3.98409831494693e-06, "loss": 0.8723886, "num_input_tokens_seen": 24522320, "step": 1147, "time_per_iteration": 2.5898265838623047 }, { "auxiliary_loss_clip": 0.01156886, "auxiliary_loss_mlp": 0.01064453, "balance_loss_clip": 1.05563867, "balance_loss_mlp": 1.03949046, "epoch": 0.0690214940628288, "flos": 18368703555840.0, "grad_norm": 1.7557033260323716, "language_loss": 0.86094105, "learning_rate": 3.984049263194367e-06, "loss": 0.88315445, "num_input_tokens_seen": 24540445, "step": 1148, "time_per_iteration": 2.748782157897949 }, { "auxiliary_loss_clip": 0.01173365, "auxiliary_loss_mlp": 0.01060047, "balance_loss_clip": 1.05569541, "balance_loss_mlp": 1.03370178, "epoch": 0.06908161731549677, "flos": 20558033418240.0, "grad_norm": 2.322434023005448, "language_loss": 0.69602191, "learning_rate": 3.9840001362064575e-06, "loss": 0.71835601, "num_input_tokens_seen": 24557105, "step": 1149, "time_per_iteration": 2.741854429244995 }, { "auxiliary_loss_clip": 0.01207871, "auxiliary_loss_mlp": 0.01051245, "balance_loss_clip": 1.06034219, "balance_loss_mlp": 1.02692604, "epoch": 0.06914174056816474, "flos": 27563666313600.0, "grad_norm": 1.9440351937259064, "language_loss": 0.8374452, "learning_rate": 3.983950933985064e-06, "loss": 0.86003637, "num_input_tokens_seen": 24578240, "step": 1150, "time_per_iteration": 2.6919586658477783 }, { "auxiliary_loss_clip": 0.01181406, "auxiliary_loss_mlp": 0.01058015, "balance_loss_clip": 1.06063652, "balance_loss_mlp": 1.03380394, "epoch": 0.06920186382083271, "flos": 15304015249920.0, "grad_norm": 4.11905785776886, "language_loss": 0.81464434, "learning_rate": 3.983901656532052e-06, "loss": 0.83703858, "num_input_tokens_seen": 24593585, "step": 1151, "time_per_iteration": 2.7979934215545654 }, { "auxiliary_loss_clip": 0.01206831, "auxiliary_loss_mlp": 0.01058184, "balance_loss_clip": 1.06409955, "balance_loss_mlp": 1.03434169, "epoch": 0.06926198707350067, "flos": 25191227894400.0, "grad_norm": 2.0324362571668724, "language_loss": 0.85408235, "learning_rate": 3.983852303849291e-06, "loss": 0.87673247, "num_input_tokens_seen": 24613110, "step": 1152, "time_per_iteration": 2.686021089553833 }, { "auxiliary_loss_clip": 0.01190935, "auxiliary_loss_mlp": 0.01062076, "balance_loss_clip": 1.06250155, "balance_loss_mlp": 1.03866374, "epoch": 0.06932211032616864, "flos": 13256137146240.0, "grad_norm": 2.182544196511779, "language_loss": 0.90594423, "learning_rate": 3.983802875938651e-06, "loss": 0.92847437, "num_input_tokens_seen": 24628795, "step": 1153, "time_per_iteration": 2.58366060256958 }, { "auxiliary_loss_clip": 0.01169877, "auxiliary_loss_mlp": 0.01055253, "balance_loss_clip": 1.05681062, "balance_loss_mlp": 1.03088629, "epoch": 0.06938223357883662, "flos": 24827381078400.0, "grad_norm": 2.1214794624630846, "language_loss": 0.81526846, "learning_rate": 3.983753372802008e-06, "loss": 0.83751976, "num_input_tokens_seen": 24645480, "step": 1154, "time_per_iteration": 2.696794271469116 }, { "auxiliary_loss_clip": 0.01188774, "auxiliary_loss_mlp": 0.01066335, "balance_loss_clip": 1.0691216, "balance_loss_mlp": 1.04200506, "epoch": 0.06944235683150458, "flos": 27267977554560.0, "grad_norm": 2.102018399986892, "language_loss": 0.75022292, "learning_rate": 3.983703794441237e-06, "loss": 0.77277398, "num_input_tokens_seen": 24664630, "step": 1155, "time_per_iteration": 2.7718143463134766 }, { "auxiliary_loss_clip": 0.01180696, "auxiliary_loss_mlp": 0.00782152, "balance_loss_clip": 1.05586052, "balance_loss_mlp": 1.00041056, "epoch": 0.06950248008417255, "flos": 25808065176960.0, "grad_norm": 1.7459449483933205, "language_loss": 0.7110405, "learning_rate": 3.98365414085822e-06, "loss": 0.73066902, "num_input_tokens_seen": 24684210, "step": 1156, "time_per_iteration": 2.7014200687408447 }, { "auxiliary_loss_clip": 0.01179101, "auxiliary_loss_mlp": 0.00782674, "balance_loss_clip": 1.0593586, "balance_loss_mlp": 1.00037348, "epoch": 0.06956260333684053, "flos": 22271546793600.0, "grad_norm": 2.067241397655847, "language_loss": 0.74882817, "learning_rate": 3.98360441205484e-06, "loss": 0.76844591, "num_input_tokens_seen": 24702490, "step": 1157, "time_per_iteration": 2.7571897506713867 }, { "auxiliary_loss_clip": 0.01178249, "auxiliary_loss_mlp": 0.01061737, "balance_loss_clip": 1.05653787, "balance_loss_mlp": 1.03697729, "epoch": 0.0696227265895085, "flos": 29681390413440.0, "grad_norm": 1.9827644507913538, "language_loss": 0.7165724, "learning_rate": 3.983554608032982e-06, "loss": 0.73897225, "num_input_tokens_seen": 24724340, "step": 1158, "time_per_iteration": 2.839745044708252 }, { "auxiliary_loss_clip": 0.01207855, "auxiliary_loss_mlp": 0.01058558, "balance_loss_clip": 1.0605582, "balance_loss_mlp": 1.03370285, "epoch": 0.06968284984217646, "flos": 25523545547520.0, "grad_norm": 1.9692207215605615, "language_loss": 0.79595017, "learning_rate": 3.983504728794533e-06, "loss": 0.8186143, "num_input_tokens_seen": 24745550, "step": 1159, "time_per_iteration": 2.7535817623138428 }, { "auxiliary_loss_clip": 0.01212717, "auxiliary_loss_mlp": 0.01068535, "balance_loss_clip": 1.06535673, "balance_loss_mlp": 1.04094958, "epoch": 0.06974297309484444, "flos": 20698192287360.0, "grad_norm": 3.5530789367722373, "language_loss": 0.80517769, "learning_rate": 3.983454774341387e-06, "loss": 0.82799017, "num_input_tokens_seen": 24762575, "step": 1160, "time_per_iteration": 2.7455785274505615 }, { "auxiliary_loss_clip": 0.0119075, "auxiliary_loss_mlp": 0.01057887, "balance_loss_clip": 1.05680609, "balance_loss_mlp": 1.03294837, "epoch": 0.0698030963475124, "flos": 26505199313280.0, "grad_norm": 1.6303409062485206, "language_loss": 0.7607069, "learning_rate": 3.983404744675437e-06, "loss": 0.78319323, "num_input_tokens_seen": 24782605, "step": 1161, "time_per_iteration": 2.773775100708008 }, { "auxiliary_loss_clip": 0.01175787, "auxiliary_loss_mlp": 0.01062083, "balance_loss_clip": 1.05773759, "balance_loss_mlp": 1.03673923, "epoch": 0.06986321960018037, "flos": 23040430346880.0, "grad_norm": 1.6605796421434038, "language_loss": 0.82758528, "learning_rate": 3.9833546397985794e-06, "loss": 0.84996402, "num_input_tokens_seen": 24802910, "step": 1162, "time_per_iteration": 2.7426044940948486 }, { "auxiliary_loss_clip": 0.01182513, "auxiliary_loss_mlp": 0.01058124, "balance_loss_clip": 1.05717576, "balance_loss_mlp": 1.03092098, "epoch": 0.06992334285284833, "flos": 28584822061440.0, "grad_norm": 1.9523155091610094, "language_loss": 0.79563475, "learning_rate": 3.983304459712716e-06, "loss": 0.81804121, "num_input_tokens_seen": 24823305, "step": 1163, "time_per_iteration": 2.720947742462158 }, { "auxiliary_loss_clip": 0.01190519, "auxiliary_loss_mlp": 0.01063375, "balance_loss_clip": 1.05861616, "balance_loss_mlp": 1.03722012, "epoch": 0.06998346610551631, "flos": 20595344670720.0, "grad_norm": 2.213365660843382, "language_loss": 0.79187214, "learning_rate": 3.983254204419749e-06, "loss": 0.81441104, "num_input_tokens_seen": 24842155, "step": 1164, "time_per_iteration": 2.6554183959960938 }, { "auxiliary_loss_clip": 0.01143916, "auxiliary_loss_mlp": 0.01067459, "balance_loss_clip": 1.05240798, "balance_loss_mlp": 1.03875315, "epoch": 0.07004358935818428, "flos": 22528810978560.0, "grad_norm": 1.421930435008642, "language_loss": 0.72855628, "learning_rate": 3.983203873921583e-06, "loss": 0.75067008, "num_input_tokens_seen": 24862080, "step": 1165, "time_per_iteration": 2.753063440322876 }, { "auxiliary_loss_clip": 0.01183824, "auxiliary_loss_mlp": 0.01059612, "balance_loss_clip": 1.06135893, "balance_loss_mlp": 1.03522193, "epoch": 0.07010371261085224, "flos": 28949997680640.0, "grad_norm": 2.453348821242437, "language_loss": 0.81136239, "learning_rate": 3.983153468220128e-06, "loss": 0.83379674, "num_input_tokens_seen": 24886165, "step": 1166, "time_per_iteration": 2.802016496658325 }, { "auxiliary_loss_clip": 0.011718, "auxiliary_loss_mlp": 0.01053529, "balance_loss_clip": 1.05450797, "balance_loss_mlp": 1.02754176, "epoch": 0.07016383586352022, "flos": 23659171050240.0, "grad_norm": 2.457667377154448, "language_loss": 0.84640259, "learning_rate": 3.983102987317295e-06, "loss": 0.86865586, "num_input_tokens_seen": 24905775, "step": 1167, "time_per_iteration": 2.7066097259521484 }, { "auxiliary_loss_clip": 0.01193446, "auxiliary_loss_mlp": 0.01064209, "balance_loss_clip": 1.06136739, "balance_loss_mlp": 1.03887713, "epoch": 0.07022395911618819, "flos": 19792130693760.0, "grad_norm": 2.6158204436543, "language_loss": 0.89524722, "learning_rate": 3.983052431214997e-06, "loss": 0.91782373, "num_input_tokens_seen": 24924295, "step": 1168, "time_per_iteration": 2.6258392333984375 }, { "auxiliary_loss_clip": 0.01190821, "auxiliary_loss_mlp": 0.01065905, "balance_loss_clip": 1.06090224, "balance_loss_mlp": 1.03705645, "epoch": 0.07028408236885615, "flos": 21689147675520.0, "grad_norm": 2.6445150319591035, "language_loss": 0.89008862, "learning_rate": 3.983001799915153e-06, "loss": 0.91265589, "num_input_tokens_seen": 24943210, "step": 1169, "time_per_iteration": 2.6858527660369873 }, { "auxiliary_loss_clip": 0.01211063, "auxiliary_loss_mlp": 0.01065533, "balance_loss_clip": 1.06400895, "balance_loss_mlp": 1.03950977, "epoch": 0.07034420562152413, "flos": 25630271832960.0, "grad_norm": 1.9672897290124218, "language_loss": 0.83834457, "learning_rate": 3.982951093419681e-06, "loss": 0.86111057, "num_input_tokens_seen": 24960360, "step": 1170, "time_per_iteration": 2.6278069019317627 }, { "auxiliary_loss_clip": 0.01180333, "auxiliary_loss_mlp": 0.00782328, "balance_loss_clip": 1.0613637, "balance_loss_mlp": 1.00041986, "epoch": 0.0704043288741921, "flos": 20810449267200.0, "grad_norm": 1.8542795171503503, "language_loss": 0.75687242, "learning_rate": 3.982900311730506e-06, "loss": 0.77649903, "num_input_tokens_seen": 24978290, "step": 1171, "time_per_iteration": 5.806530475616455 }, { "auxiliary_loss_clip": 0.01179645, "auxiliary_loss_mlp": 0.0106394, "balance_loss_clip": 1.06133175, "balance_loss_mlp": 1.03919196, "epoch": 0.07046445212686006, "flos": 25593176062080.0, "grad_norm": 2.482864122539831, "language_loss": 0.88865125, "learning_rate": 3.9828494548495514e-06, "loss": 0.91108704, "num_input_tokens_seen": 24997055, "step": 1172, "time_per_iteration": 4.371561288833618 }, { "auxiliary_loss_clip": 0.01197698, "auxiliary_loss_mlp": 0.01054991, "balance_loss_clip": 1.06532764, "balance_loss_mlp": 1.02858603, "epoch": 0.07052457537952803, "flos": 25556978131200.0, "grad_norm": 1.6816354314161714, "language_loss": 0.82075119, "learning_rate": 3.982798522778748e-06, "loss": 0.84327805, "num_input_tokens_seen": 25017490, "step": 1173, "time_per_iteration": 4.611542463302612 }, { "auxiliary_loss_clip": 0.01200886, "auxiliary_loss_mlp": 0.01060851, "balance_loss_clip": 1.06317592, "balance_loss_mlp": 1.03503036, "epoch": 0.070584698632196, "flos": 17968515154560.0, "grad_norm": 2.007232853627583, "language_loss": 0.82071686, "learning_rate": 3.9827475155200245e-06, "loss": 0.8433342, "num_input_tokens_seen": 25035660, "step": 1174, "time_per_iteration": 2.6334969997406006 }, { "auxiliary_loss_clip": 0.01180907, "auxiliary_loss_mlp": 0.01059972, "balance_loss_clip": 1.05857778, "balance_loss_mlp": 1.03473568, "epoch": 0.07064482188486397, "flos": 25370888745600.0, "grad_norm": 2.09222115072597, "language_loss": 0.85013211, "learning_rate": 3.982696433075317e-06, "loss": 0.87254095, "num_input_tokens_seen": 25054785, "step": 1175, "time_per_iteration": 2.861591339111328 }, { "auxiliary_loss_clip": 0.01196955, "auxiliary_loss_mlp": 0.01069941, "balance_loss_clip": 1.06447482, "balance_loss_mlp": 1.04605186, "epoch": 0.07070494513753194, "flos": 24899848767360.0, "grad_norm": 1.7270820646539309, "language_loss": 0.83103871, "learning_rate": 3.982645275446563e-06, "loss": 0.85370767, "num_input_tokens_seen": 25075180, "step": 1176, "time_per_iteration": 2.754521608352661 }, { "auxiliary_loss_clip": 0.01152261, "auxiliary_loss_mlp": 0.01062154, "balance_loss_clip": 1.05370057, "balance_loss_mlp": 1.0352838, "epoch": 0.07076506839019991, "flos": 22338447874560.0, "grad_norm": 3.4939498355716996, "language_loss": 0.74409902, "learning_rate": 3.982594042635701e-06, "loss": 0.7662431, "num_input_tokens_seen": 25093035, "step": 1177, "time_per_iteration": 2.692426919937134 }, { "auxiliary_loss_clip": 0.01188551, "auxiliary_loss_mlp": 0.0106394, "balance_loss_clip": 1.06080353, "balance_loss_mlp": 1.03801203, "epoch": 0.07082519164286788, "flos": 18660800954880.0, "grad_norm": 1.8240190288677762, "language_loss": 0.85965598, "learning_rate": 3.982542734644673e-06, "loss": 0.88218087, "num_input_tokens_seen": 25112520, "step": 1178, "time_per_iteration": 2.7197048664093018 }, { "auxiliary_loss_clip": 0.01082521, "auxiliary_loss_mlp": 0.01013999, "balance_loss_clip": 1.03661168, "balance_loss_mlp": 1.01023197, "epoch": 0.07088531489553584, "flos": 63654107610240.0, "grad_norm": 0.8453670789764802, "language_loss": 0.63256603, "learning_rate": 3.982491351475427e-06, "loss": 0.65353125, "num_input_tokens_seen": 25177760, "step": 1179, "time_per_iteration": 3.3419978618621826 }, { "auxiliary_loss_clip": 0.01211274, "auxiliary_loss_mlp": 0.01073372, "balance_loss_clip": 1.06935215, "balance_loss_mlp": 1.04858887, "epoch": 0.07094543814820382, "flos": 21572688804480.0, "grad_norm": 3.2714198066984177, "language_loss": 0.83388901, "learning_rate": 3.98243989312991e-06, "loss": 0.85673553, "num_input_tokens_seen": 25195260, "step": 1180, "time_per_iteration": 2.631992816925049 }, { "auxiliary_loss_clip": 0.01182661, "auxiliary_loss_mlp": 0.01071326, "balance_loss_clip": 1.06119037, "balance_loss_mlp": 1.04624391, "epoch": 0.07100556140087179, "flos": 22089946608000.0, "grad_norm": 2.0409456536886386, "language_loss": 0.88649988, "learning_rate": 3.982388359610074e-06, "loss": 0.90903974, "num_input_tokens_seen": 25212740, "step": 1181, "time_per_iteration": 2.696789264678955 }, { "auxiliary_loss_clip": 0.01180377, "auxiliary_loss_mlp": 0.01070036, "balance_loss_clip": 1.06187141, "balance_loss_mlp": 1.04516935, "epoch": 0.07106568465353975, "flos": 47922286400640.0, "grad_norm": 1.8294049229574356, "language_loss": 0.83244783, "learning_rate": 3.9823367509178725e-06, "loss": 0.85495198, "num_input_tokens_seen": 25236420, "step": 1182, "time_per_iteration": 2.9415605068206787 }, { "auxiliary_loss_clip": 0.01193669, "auxiliary_loss_mlp": 0.01067019, "balance_loss_clip": 1.0641923, "balance_loss_mlp": 1.04150808, "epoch": 0.07112580790620772, "flos": 23440798316160.0, "grad_norm": 3.5892595189310903, "language_loss": 0.79067838, "learning_rate": 3.982285067055262e-06, "loss": 0.81328523, "num_input_tokens_seen": 25255120, "step": 1183, "time_per_iteration": 2.7284862995147705 }, { "auxiliary_loss_clip": 0.01211976, "auxiliary_loss_mlp": 0.01064792, "balance_loss_clip": 1.06126475, "balance_loss_mlp": 1.03866172, "epoch": 0.0711859311588757, "flos": 31868888682240.0, "grad_norm": 2.5463322111759354, "language_loss": 0.788867, "learning_rate": 3.982233308024204e-06, "loss": 0.81163466, "num_input_tokens_seen": 25275150, "step": 1184, "time_per_iteration": 2.7531635761260986 }, { "auxiliary_loss_clip": 0.01152059, "auxiliary_loss_mlp": 0.01062006, "balance_loss_clip": 1.05961919, "balance_loss_mlp": 1.03752065, "epoch": 0.07124605441154366, "flos": 19610315026560.0, "grad_norm": 1.904751850318294, "language_loss": 0.76806915, "learning_rate": 3.98218147382666e-06, "loss": 0.79020983, "num_input_tokens_seen": 25293680, "step": 1185, "time_per_iteration": 2.732539176940918 }, { "auxiliary_loss_clip": 0.01208288, "auxiliary_loss_mlp": 0.01073792, "balance_loss_clip": 1.06328642, "balance_loss_mlp": 1.04903185, "epoch": 0.07130617766421163, "flos": 14684448533760.0, "grad_norm": 2.1301142092644696, "language_loss": 0.65472758, "learning_rate": 3.982129564464596e-06, "loss": 0.67754835, "num_input_tokens_seen": 25310050, "step": 1186, "time_per_iteration": 2.757812261581421 }, { "auxiliary_loss_clip": 0.01195497, "auxiliary_loss_mlp": 0.01057322, "balance_loss_clip": 1.06479859, "balance_loss_mlp": 1.03274107, "epoch": 0.07136630091687961, "flos": 26067915141120.0, "grad_norm": 2.1671481434625894, "language_loss": 0.69743419, "learning_rate": 3.98207757993998e-06, "loss": 0.71996236, "num_input_tokens_seen": 25331020, "step": 1187, "time_per_iteration": 2.746615409851074 }, { "auxiliary_loss_clip": 0.01151827, "auxiliary_loss_mlp": 0.01067347, "balance_loss_clip": 1.05412316, "balance_loss_mlp": 1.04367232, "epoch": 0.07142642416954757, "flos": 15669190869120.0, "grad_norm": 2.8037131445876597, "language_loss": 0.7861973, "learning_rate": 3.9820255202547845e-06, "loss": 0.80838895, "num_input_tokens_seen": 25347875, "step": 1188, "time_per_iteration": 2.738281726837158 }, { "auxiliary_loss_clip": 0.01203626, "auxiliary_loss_mlp": 0.01059966, "balance_loss_clip": 1.06304908, "balance_loss_mlp": 1.03530121, "epoch": 0.07148654742221554, "flos": 19755322231680.0, "grad_norm": 1.8909260147246576, "language_loss": 0.84754103, "learning_rate": 3.981973385410981e-06, "loss": 0.87017697, "num_input_tokens_seen": 25366715, "step": 1189, "time_per_iteration": 2.5770246982574463 }, { "auxiliary_loss_clip": 0.01173135, "auxiliary_loss_mlp": 0.0078213, "balance_loss_clip": 1.06234396, "balance_loss_mlp": 1.00041807, "epoch": 0.07154667067488352, "flos": 23471824688640.0, "grad_norm": 5.212083930118342, "language_loss": 0.76932275, "learning_rate": 3.9819211754105494e-06, "loss": 0.78887534, "num_input_tokens_seen": 25385450, "step": 1190, "time_per_iteration": 2.7057712078094482 }, { "auxiliary_loss_clip": 0.01208346, "auxiliary_loss_mlp": 0.01074705, "balance_loss_clip": 1.06283545, "balance_loss_mlp": 1.04751348, "epoch": 0.07160679392755148, "flos": 18332936588160.0, "grad_norm": 2.5312098602102084, "language_loss": 0.75201792, "learning_rate": 3.981868890255468e-06, "loss": 0.7748484, "num_input_tokens_seen": 25403940, "step": 1191, "time_per_iteration": 2.6071674823760986 }, { "auxiliary_loss_clip": 0.01162268, "auxiliary_loss_mlp": 0.01063437, "balance_loss_clip": 1.0519917, "balance_loss_mlp": 1.03649545, "epoch": 0.07166691718021945, "flos": 17747017937280.0, "grad_norm": 2.470839013019174, "language_loss": 0.74334443, "learning_rate": 3.981816529947719e-06, "loss": 0.76560152, "num_input_tokens_seen": 25420410, "step": 1192, "time_per_iteration": 2.661078453063965 }, { "auxiliary_loss_clip": 0.01202036, "auxiliary_loss_mlp": 0.01054727, "balance_loss_clip": 1.05904579, "balance_loss_mlp": 1.03099298, "epoch": 0.07172704043288743, "flos": 22451925916800.0, "grad_norm": 2.443309122344248, "language_loss": 0.78010541, "learning_rate": 3.9817640944892896e-06, "loss": 0.8026731, "num_input_tokens_seen": 25439415, "step": 1193, "time_per_iteration": 2.5603158473968506 }, { "auxiliary_loss_clip": 0.01186747, "auxiliary_loss_mlp": 0.01059465, "balance_loss_clip": 1.06358278, "balance_loss_mlp": 1.03319085, "epoch": 0.07178716368555539, "flos": 23222210100480.0, "grad_norm": 2.1011663585924585, "language_loss": 0.85497916, "learning_rate": 3.981711583882166e-06, "loss": 0.87744129, "num_input_tokens_seen": 25458715, "step": 1194, "time_per_iteration": 2.6819851398468018 }, { "auxiliary_loss_clip": 0.01184191, "auxiliary_loss_mlp": 0.01067737, "balance_loss_clip": 1.05706751, "balance_loss_mlp": 1.04135609, "epoch": 0.07184728693822336, "flos": 25150828072320.0, "grad_norm": 2.0205668140023185, "language_loss": 0.8183766, "learning_rate": 3.981658998128341e-06, "loss": 0.84089589, "num_input_tokens_seen": 25477985, "step": 1195, "time_per_iteration": 2.6646647453308105 }, { "auxiliary_loss_clip": 0.01165951, "auxiliary_loss_mlp": 0.01063438, "balance_loss_clip": 1.0578239, "balance_loss_mlp": 1.03976321, "epoch": 0.07190741019089132, "flos": 22711237176960.0, "grad_norm": 2.161995064372768, "language_loss": 0.80093575, "learning_rate": 3.981606337229808e-06, "loss": 0.82322967, "num_input_tokens_seen": 25497110, "step": 1196, "time_per_iteration": 2.7217979431152344 }, { "auxiliary_loss_clip": 0.01176131, "auxiliary_loss_mlp": 0.00784114, "balance_loss_clip": 1.06106043, "balance_loss_mlp": 1.00034249, "epoch": 0.0719675334435593, "flos": 29349791032320.0, "grad_norm": 2.5905261146074263, "language_loss": 0.71339291, "learning_rate": 3.9815536011885655e-06, "loss": 0.73299539, "num_input_tokens_seen": 25516555, "step": 1197, "time_per_iteration": 2.7931766510009766 }, { "auxiliary_loss_clip": 0.01157444, "auxiliary_loss_mlp": 0.01055247, "balance_loss_clip": 1.06130266, "balance_loss_mlp": 1.03074968, "epoch": 0.07202765669622727, "flos": 17639788861440.0, "grad_norm": 3.074283933156949, "language_loss": 0.85951984, "learning_rate": 3.98150079000661e-06, "loss": 0.88164675, "num_input_tokens_seen": 25533895, "step": 1198, "time_per_iteration": 2.7241532802581787 }, { "auxiliary_loss_clip": 0.01160083, "auxiliary_loss_mlp": 0.0106501, "balance_loss_clip": 1.0597434, "balance_loss_mlp": 1.03944004, "epoch": 0.07208777994889523, "flos": 21434038306560.0, "grad_norm": 2.052617638295489, "language_loss": 0.83840948, "learning_rate": 3.981447903685947e-06, "loss": 0.86066043, "num_input_tokens_seen": 25554195, "step": 1199, "time_per_iteration": 2.71362566947937 }, { "auxiliary_loss_clip": 0.01212755, "auxiliary_loss_mlp": 0.01060557, "balance_loss_clip": 1.06877887, "balance_loss_mlp": 1.03709614, "epoch": 0.07214790320156321, "flos": 26940867373440.0, "grad_norm": 3.1601590133124837, "language_loss": 0.7623595, "learning_rate": 3.981394942228581e-06, "loss": 0.78509259, "num_input_tokens_seen": 25574155, "step": 1200, "time_per_iteration": 2.6913061141967773 }, { "auxiliary_loss_clip": 0.0119008, "auxiliary_loss_mlp": 0.010701, "balance_loss_clip": 1.06442261, "balance_loss_mlp": 1.04487491, "epoch": 0.07220802645423118, "flos": 23879949995520.0, "grad_norm": 2.2017873087036226, "language_loss": 0.83013475, "learning_rate": 3.98134190563652e-06, "loss": 0.85273659, "num_input_tokens_seen": 25592735, "step": 1201, "time_per_iteration": 2.6983115673065186 }, { "auxiliary_loss_clip": 0.01196941, "auxiliary_loss_mlp": 0.01065672, "balance_loss_clip": 1.06197119, "balance_loss_mlp": 1.03952968, "epoch": 0.07226814970689914, "flos": 19243631036160.0, "grad_norm": 20.835065187143087, "language_loss": 0.68601412, "learning_rate": 3.981288793911775e-06, "loss": 0.70864022, "num_input_tokens_seen": 25611510, "step": 1202, "time_per_iteration": 2.691742420196533 }, { "auxiliary_loss_clip": 0.01182684, "auxiliary_loss_mlp": 0.00782201, "balance_loss_clip": 1.06256962, "balance_loss_mlp": 1.00038218, "epoch": 0.07232827295956712, "flos": 19172025273600.0, "grad_norm": 1.9661831136137597, "language_loss": 0.87487721, "learning_rate": 3.98123560705636e-06, "loss": 0.89452606, "num_input_tokens_seen": 25629560, "step": 1203, "time_per_iteration": 2.7832019329071045 }, { "auxiliary_loss_clip": 0.01154778, "auxiliary_loss_mlp": 0.01065748, "balance_loss_clip": 1.05210066, "balance_loss_mlp": 1.04065442, "epoch": 0.07238839621223508, "flos": 17639752947840.0, "grad_norm": 1.731721557525142, "language_loss": 0.78053147, "learning_rate": 3.981182345072293e-06, "loss": 0.80273676, "num_input_tokens_seen": 25648330, "step": 1204, "time_per_iteration": 2.7754547595977783 }, { "auxiliary_loss_clip": 0.01191832, "auxiliary_loss_mlp": 0.01065794, "balance_loss_clip": 1.06211591, "balance_loss_mlp": 1.04084373, "epoch": 0.07244851946490305, "flos": 28292401440000.0, "grad_norm": 1.5043252978087258, "language_loss": 0.82094097, "learning_rate": 3.981129007961593e-06, "loss": 0.84351724, "num_input_tokens_seen": 25669470, "step": 1205, "time_per_iteration": 2.680457353591919 }, { "auxiliary_loss_clip": 0.01180244, "auxiliary_loss_mlp": 0.00782807, "balance_loss_clip": 1.06221068, "balance_loss_mlp": 1.00036049, "epoch": 0.07250864271757101, "flos": 22564829341440.0, "grad_norm": 1.6438962430217685, "language_loss": 0.76715982, "learning_rate": 3.981075595726283e-06, "loss": 0.78679025, "num_input_tokens_seen": 25690470, "step": 1206, "time_per_iteration": 2.7028439044952393 }, { "auxiliary_loss_clip": 0.01188223, "auxiliary_loss_mlp": 0.01059861, "balance_loss_clip": 1.06262684, "balance_loss_mlp": 1.03442228, "epoch": 0.072568765970239, "flos": 21762405463680.0, "grad_norm": 1.9378198243304647, "language_loss": 0.77272987, "learning_rate": 3.981022108368387e-06, "loss": 0.79521072, "num_input_tokens_seen": 25709205, "step": 1207, "time_per_iteration": 2.779289960861206 }, { "auxiliary_loss_clip": 0.01185538, "auxiliary_loss_mlp": 0.01053693, "balance_loss_clip": 1.05844951, "balance_loss_mlp": 1.03062558, "epoch": 0.07262888922290696, "flos": 25519702792320.0, "grad_norm": 1.8716528383816402, "language_loss": 0.79480875, "learning_rate": 3.9809685458899345e-06, "loss": 0.81720108, "num_input_tokens_seen": 25728485, "step": 1208, "time_per_iteration": 2.682965040206909 }, { "auxiliary_loss_clip": 0.01184899, "auxiliary_loss_mlp": 0.01054862, "balance_loss_clip": 1.05801737, "balance_loss_mlp": 1.03198612, "epoch": 0.07268901247557492, "flos": 21246548290560.0, "grad_norm": 2.5612886109689765, "language_loss": 0.78537548, "learning_rate": 3.980914908292955e-06, "loss": 0.80777311, "num_input_tokens_seen": 25747730, "step": 1209, "time_per_iteration": 2.6582658290863037 }, { "auxiliary_loss_clip": 0.01191905, "auxiliary_loss_mlp": 0.01067741, "balance_loss_clip": 1.05931175, "balance_loss_mlp": 1.04408956, "epoch": 0.0727491357282429, "flos": 25479302970240.0, "grad_norm": 2.351303434522043, "language_loss": 0.80920583, "learning_rate": 3.980861195579486e-06, "loss": 0.83180225, "num_input_tokens_seen": 25768050, "step": 1210, "time_per_iteration": 4.241993427276611 }, { "auxiliary_loss_clip": 0.0117493, "auxiliary_loss_mlp": 0.01063711, "balance_loss_clip": 1.06087565, "balance_loss_mlp": 1.03891551, "epoch": 0.07280925898091087, "flos": 24462169545600.0, "grad_norm": 1.875347829314158, "language_loss": 0.84302205, "learning_rate": 3.98080740775156e-06, "loss": 0.86540848, "num_input_tokens_seen": 25787985, "step": 1211, "time_per_iteration": 4.289919853210449 }, { "auxiliary_loss_clip": 0.01162055, "auxiliary_loss_mlp": 0.01060218, "balance_loss_clip": 1.05356658, "balance_loss_mlp": 1.03629231, "epoch": 0.07286938223357883, "flos": 18288191220480.0, "grad_norm": 2.991110515222773, "language_loss": 0.90684664, "learning_rate": 3.98075354481122e-06, "loss": 0.92906934, "num_input_tokens_seen": 25803620, "step": 1212, "time_per_iteration": 2.660780906677246 }, { "auxiliary_loss_clip": 0.01202443, "auxiliary_loss_mlp": 0.01058817, "balance_loss_clip": 1.0623759, "balance_loss_mlp": 1.03490353, "epoch": 0.07292950548624681, "flos": 21214803646080.0, "grad_norm": 1.7918815842724805, "language_loss": 0.72358596, "learning_rate": 3.9806996067605055e-06, "loss": 0.74619853, "num_input_tokens_seen": 25823315, "step": 1213, "time_per_iteration": 4.303524017333984 }, { "auxiliary_loss_clip": 0.01153662, "auxiliary_loss_mlp": 0.01055706, "balance_loss_clip": 1.05658662, "balance_loss_mlp": 1.03089869, "epoch": 0.07298962873891478, "flos": 24642009964800.0, "grad_norm": 1.8655932637344164, "language_loss": 0.84356117, "learning_rate": 3.980645593601465e-06, "loss": 0.86565483, "num_input_tokens_seen": 25842605, "step": 1214, "time_per_iteration": 2.7505569458007812 }, { "auxiliary_loss_clip": 0.01208881, "auxiliary_loss_mlp": 0.01062075, "balance_loss_clip": 1.06484771, "balance_loss_mlp": 1.03723145, "epoch": 0.07304975199158274, "flos": 27052765217280.0, "grad_norm": 2.025651344907852, "language_loss": 0.84113681, "learning_rate": 3.980591505336144e-06, "loss": 0.86384636, "num_input_tokens_seen": 25863030, "step": 1215, "time_per_iteration": 2.7235965728759766 }, { "auxiliary_loss_clip": 0.01149957, "auxiliary_loss_mlp": 0.01062992, "balance_loss_clip": 1.05138278, "balance_loss_mlp": 1.03744531, "epoch": 0.07310987524425071, "flos": 33549544091520.0, "grad_norm": 1.9312816725096997, "language_loss": 0.80926049, "learning_rate": 3.980537341966595e-06, "loss": 0.83139002, "num_input_tokens_seen": 25888015, "step": 1216, "time_per_iteration": 2.9129130840301514 }, { "auxiliary_loss_clip": 0.01167944, "auxiliary_loss_mlp": 0.01060276, "balance_loss_clip": 1.05619049, "balance_loss_mlp": 1.03680408, "epoch": 0.07316999849691869, "flos": 28110944908800.0, "grad_norm": 3.2846247291101975, "language_loss": 0.75949144, "learning_rate": 3.980483103494872e-06, "loss": 0.78177369, "num_input_tokens_seen": 25908660, "step": 1217, "time_per_iteration": 2.7106521129608154 }, { "auxiliary_loss_clip": 0.01169026, "auxiliary_loss_mlp": 0.01056631, "balance_loss_clip": 1.06182647, "balance_loss_mlp": 1.03477991, "epoch": 0.07323012174958665, "flos": 14392602529920.0, "grad_norm": 1.9658490798069863, "language_loss": 0.86455309, "learning_rate": 3.98042878992303e-06, "loss": 0.88680959, "num_input_tokens_seen": 25927215, "step": 1218, "time_per_iteration": 2.5911786556243896 }, { "auxiliary_loss_clip": 0.01192266, "auxiliary_loss_mlp": 0.0106258, "balance_loss_clip": 1.06015348, "balance_loss_mlp": 1.03916681, "epoch": 0.07329024500225462, "flos": 21616428591360.0, "grad_norm": 2.2310702082820675, "language_loss": 0.86782354, "learning_rate": 3.9803744012531305e-06, "loss": 0.89037204, "num_input_tokens_seen": 25945500, "step": 1219, "time_per_iteration": 2.608562707901001 }, { "auxiliary_loss_clip": 0.01201545, "auxiliary_loss_mlp": 0.01058282, "balance_loss_clip": 1.06024373, "balance_loss_mlp": 1.03539419, "epoch": 0.0733503682549226, "flos": 13224141106560.0, "grad_norm": 2.095886373367052, "language_loss": 0.84608674, "learning_rate": 3.980319937487235e-06, "loss": 0.86868501, "num_input_tokens_seen": 25963105, "step": 1220, "time_per_iteration": 2.469189405441284 }, { "auxiliary_loss_clip": 0.01158855, "auxiliary_loss_mlp": 0.01063399, "balance_loss_clip": 1.05358922, "balance_loss_mlp": 1.03942597, "epoch": 0.07341049150759056, "flos": 20886975192960.0, "grad_norm": 2.648884311755534, "language_loss": 0.77114344, "learning_rate": 3.98026539862741e-06, "loss": 0.79336596, "num_input_tokens_seen": 25981690, "step": 1221, "time_per_iteration": 2.671762466430664 }, { "auxiliary_loss_clip": 0.01158201, "auxiliary_loss_mlp": 0.01064916, "balance_loss_clip": 1.05726743, "balance_loss_mlp": 1.04082406, "epoch": 0.07347061476025853, "flos": 15413614623360.0, "grad_norm": 2.5357389392469942, "language_loss": 0.91631913, "learning_rate": 3.980210784675722e-06, "loss": 0.93855029, "num_input_tokens_seen": 25999890, "step": 1222, "time_per_iteration": 2.6973063945770264 }, { "auxiliary_loss_clip": 0.01135907, "auxiliary_loss_mlp": 0.01064872, "balance_loss_clip": 1.05333126, "balance_loss_mlp": 1.04169726, "epoch": 0.0735307380129265, "flos": 11108859131520.0, "grad_norm": 2.8024324299253047, "language_loss": 0.90976465, "learning_rate": 3.980156095634242e-06, "loss": 0.93177247, "num_input_tokens_seen": 26016445, "step": 1223, "time_per_iteration": 2.8141093254089355 }, { "auxiliary_loss_clip": 0.01202875, "auxiliary_loss_mlp": 0.01077185, "balance_loss_clip": 1.06232905, "balance_loss_mlp": 1.05341494, "epoch": 0.07359086126559447, "flos": 23732392924800.0, "grad_norm": 1.9348534518871447, "language_loss": 0.82161939, "learning_rate": 3.980101331505045e-06, "loss": 0.84442002, "num_input_tokens_seen": 26036080, "step": 1224, "time_per_iteration": 2.640432119369507 }, { "auxiliary_loss_clip": 0.01200329, "auxiliary_loss_mlp": 0.01057586, "balance_loss_clip": 1.05987597, "balance_loss_mlp": 1.03229022, "epoch": 0.07365098451826244, "flos": 20993270515200.0, "grad_norm": 2.31744406237409, "language_loss": 0.83194047, "learning_rate": 3.9800464922902076e-06, "loss": 0.85451961, "num_input_tokens_seen": 26055805, "step": 1225, "time_per_iteration": 2.6159210205078125 }, { "auxiliary_loss_clip": 0.01170115, "auxiliary_loss_mlp": 0.01056068, "balance_loss_clip": 1.05743551, "balance_loss_mlp": 1.03190422, "epoch": 0.0737111077709304, "flos": 19933582452480.0, "grad_norm": 2.2959030425986544, "language_loss": 0.90388274, "learning_rate": 3.979991577991808e-06, "loss": 0.9261446, "num_input_tokens_seen": 26073905, "step": 1226, "time_per_iteration": 2.6527435779571533 }, { "auxiliary_loss_clip": 0.01207799, "auxiliary_loss_mlp": 0.0104599, "balance_loss_clip": 1.05913424, "balance_loss_mlp": 1.02080154, "epoch": 0.07377123102359838, "flos": 16581537342720.0, "grad_norm": 2.579592162134606, "language_loss": 0.76626784, "learning_rate": 3.97993658861193e-06, "loss": 0.78880572, "num_input_tokens_seen": 26091700, "step": 1227, "time_per_iteration": 2.596151351928711 }, { "auxiliary_loss_clip": 0.0118909, "auxiliary_loss_mlp": 0.01053386, "balance_loss_clip": 1.06296694, "balance_loss_mlp": 1.02954459, "epoch": 0.07383135427626634, "flos": 28328563457280.0, "grad_norm": 7.788838200212175, "language_loss": 0.8555491, "learning_rate": 3.9798815241526575e-06, "loss": 0.87797379, "num_input_tokens_seen": 26114105, "step": 1228, "time_per_iteration": 2.6955716609954834 }, { "auxiliary_loss_clip": 0.01191175, "auxiliary_loss_mlp": 0.01062669, "balance_loss_clip": 1.05897212, "balance_loss_mlp": 1.03860044, "epoch": 0.07389147752893431, "flos": 20047168235520.0, "grad_norm": 2.2575099517148898, "language_loss": 0.79598552, "learning_rate": 3.97982638461608e-06, "loss": 0.818524, "num_input_tokens_seen": 26131165, "step": 1229, "time_per_iteration": 2.6544861793518066 }, { "auxiliary_loss_clip": 0.01192886, "auxiliary_loss_mlp": 0.00782044, "balance_loss_clip": 1.05966699, "balance_loss_mlp": 1.00032902, "epoch": 0.07395160078160229, "flos": 18114132890880.0, "grad_norm": 2.2881874382496377, "language_loss": 0.78209347, "learning_rate": 3.979771170004287e-06, "loss": 0.80184281, "num_input_tokens_seen": 26150040, "step": 1230, "time_per_iteration": 2.6001133918762207 }, { "auxiliary_loss_clip": 0.0120142, "auxiliary_loss_mlp": 0.01052342, "balance_loss_clip": 1.06209648, "balance_loss_mlp": 1.02739108, "epoch": 0.07401172403427025, "flos": 23586918842880.0, "grad_norm": 2.038847041772147, "language_loss": 0.8136946, "learning_rate": 3.979715880319372e-06, "loss": 0.83623219, "num_input_tokens_seen": 26169380, "step": 1231, "time_per_iteration": 2.6364073753356934 }, { "auxiliary_loss_clip": 0.01179975, "auxiliary_loss_mlp": 0.01070917, "balance_loss_clip": 1.05690873, "balance_loss_mlp": 1.04599047, "epoch": 0.07407184728693822, "flos": 26359904799360.0, "grad_norm": 2.096832924731062, "language_loss": 0.95204866, "learning_rate": 3.979660515563434e-06, "loss": 0.97455758, "num_input_tokens_seen": 26189420, "step": 1232, "time_per_iteration": 2.7929203510284424 }, { "auxiliary_loss_clip": 0.01187282, "auxiliary_loss_mlp": 0.01059661, "balance_loss_clip": 1.06202245, "balance_loss_mlp": 1.03733301, "epoch": 0.0741319705396062, "flos": 22200443821440.0, "grad_norm": 1.7778448126368063, "language_loss": 0.80695188, "learning_rate": 3.979605075738569e-06, "loss": 0.82942128, "num_input_tokens_seen": 26209300, "step": 1233, "time_per_iteration": 2.7945051193237305 }, { "auxiliary_loss_clip": 0.01209245, "auxiliary_loss_mlp": 0.0106207, "balance_loss_clip": 1.06238747, "balance_loss_mlp": 1.03602231, "epoch": 0.07419209379227416, "flos": 39200482523520.0, "grad_norm": 2.136728864247421, "language_loss": 0.70708907, "learning_rate": 3.979549560846883e-06, "loss": 0.72980225, "num_input_tokens_seen": 26228110, "step": 1234, "time_per_iteration": 2.9646782875061035 }, { "auxiliary_loss_clip": 0.01167486, "auxiliary_loss_mlp": 0.01068879, "balance_loss_clip": 1.0542618, "balance_loss_mlp": 1.04265285, "epoch": 0.07425221704494213, "flos": 22781657790720.0, "grad_norm": 1.7921102377369336, "language_loss": 0.76852918, "learning_rate": 3.979493970890478e-06, "loss": 0.79089284, "num_input_tokens_seen": 26247020, "step": 1235, "time_per_iteration": 2.820577621459961 }, { "auxiliary_loss_clip": 0.01198028, "auxiliary_loss_mlp": 0.01055883, "balance_loss_clip": 1.05918813, "balance_loss_mlp": 1.0321244, "epoch": 0.0743123402976101, "flos": 22272983337600.0, "grad_norm": 2.3018318065058097, "language_loss": 0.82748145, "learning_rate": 3.979438305871464e-06, "loss": 0.85002053, "num_input_tokens_seen": 26265750, "step": 1236, "time_per_iteration": 2.6302287578582764 }, { "auxiliary_loss_clip": 0.01154783, "auxiliary_loss_mlp": 0.00782014, "balance_loss_clip": 1.05519629, "balance_loss_mlp": 1.00039148, "epoch": 0.07437246355027807, "flos": 29315029645440.0, "grad_norm": 1.7985383717833268, "language_loss": 0.7595011, "learning_rate": 3.979382565791951e-06, "loss": 0.77886909, "num_input_tokens_seen": 26287905, "step": 1237, "time_per_iteration": 2.721931219100952 }, { "auxiliary_loss_clip": 0.01135551, "auxiliary_loss_mlp": 0.00783311, "balance_loss_clip": 1.0505693, "balance_loss_mlp": 1.00031757, "epoch": 0.07443258680294604, "flos": 31944732249600.0, "grad_norm": 1.6915170784810407, "language_loss": 0.77458763, "learning_rate": 3.979326750654053e-06, "loss": 0.79377621, "num_input_tokens_seen": 26311795, "step": 1238, "time_per_iteration": 2.831620931625366 }, { "auxiliary_loss_clip": 0.01177529, "auxiliary_loss_mlp": 0.01057762, "balance_loss_clip": 1.05673254, "balance_loss_mlp": 1.03311002, "epoch": 0.074492710055614, "flos": 22675290641280.0, "grad_norm": 1.9053364150897723, "language_loss": 0.867737, "learning_rate": 3.9792708604598854e-06, "loss": 0.89008987, "num_input_tokens_seen": 26330330, "step": 1239, "time_per_iteration": 2.6697263717651367 }, { "auxiliary_loss_clip": 0.01159844, "auxiliary_loss_mlp": 0.01050954, "balance_loss_clip": 1.05222142, "balance_loss_mlp": 1.02532458, "epoch": 0.07455283330828198, "flos": 21284901037440.0, "grad_norm": 26.978042105238785, "language_loss": 0.89356089, "learning_rate": 3.979214895211569e-06, "loss": 0.91566885, "num_input_tokens_seen": 26348865, "step": 1240, "time_per_iteration": 2.846013069152832 }, { "auxiliary_loss_clip": 0.01174117, "auxiliary_loss_mlp": 0.01063539, "balance_loss_clip": 1.05857158, "balance_loss_mlp": 1.03713393, "epoch": 0.07461295656094995, "flos": 24388408967040.0, "grad_norm": 1.9346624045484253, "language_loss": 0.88873678, "learning_rate": 3.979158854911225e-06, "loss": 0.91111326, "num_input_tokens_seen": 26368210, "step": 1241, "time_per_iteration": 2.6926562786102295 }, { "auxiliary_loss_clip": 0.01079637, "auxiliary_loss_mlp": 0.01009562, "balance_loss_clip": 1.03489435, "balance_loss_mlp": 1.00405502, "epoch": 0.07467307981361791, "flos": 62109660574080.0, "grad_norm": 0.8973011136706247, "language_loss": 0.63067901, "learning_rate": 3.979102739560979e-06, "loss": 0.65157104, "num_input_tokens_seen": 26424890, "step": 1242, "time_per_iteration": 3.298609972000122 }, { "auxiliary_loss_clip": 0.01164269, "auxiliary_loss_mlp": 0.01068833, "balance_loss_clip": 1.05246222, "balance_loss_mlp": 1.03819644, "epoch": 0.07473320306628589, "flos": 24863148046080.0, "grad_norm": 3.87499965477456, "language_loss": 0.62926078, "learning_rate": 3.9790465491629595e-06, "loss": 0.65159178, "num_input_tokens_seen": 26446405, "step": 1243, "time_per_iteration": 2.7774572372436523 }, { "auxiliary_loss_clip": 0.01188864, "auxiliary_loss_mlp": 0.01059918, "balance_loss_clip": 1.05716145, "balance_loss_mlp": 1.03499091, "epoch": 0.07479332631895386, "flos": 24897442556160.0, "grad_norm": 1.6252135866538246, "language_loss": 0.76259589, "learning_rate": 3.978990283719296e-06, "loss": 0.78508377, "num_input_tokens_seen": 26466070, "step": 1244, "time_per_iteration": 2.714459180831909 }, { "auxiliary_loss_clip": 0.01184345, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.0611167, "balance_loss_mlp": 1.00038469, "epoch": 0.07485344957162182, "flos": 17815247821440.0, "grad_norm": 5.636002853507256, "language_loss": 0.69419599, "learning_rate": 3.978933943232123e-06, "loss": 0.71387023, "num_input_tokens_seen": 26479350, "step": 1245, "time_per_iteration": 2.640895366668701 }, { "auxiliary_loss_clip": 0.01203955, "auxiliary_loss_mlp": 0.01062684, "balance_loss_clip": 1.06098139, "balance_loss_mlp": 1.0372088, "epoch": 0.0749135728242898, "flos": 25010202326400.0, "grad_norm": 2.5525245798098757, "language_loss": 0.88635457, "learning_rate": 3.978877527703576e-06, "loss": 0.90902102, "num_input_tokens_seen": 26498255, "step": 1246, "time_per_iteration": 2.747765302658081 }, { "auxiliary_loss_clip": 0.01212369, "auxiliary_loss_mlp": 0.01077452, "balance_loss_clip": 1.06102896, "balance_loss_mlp": 1.049402, "epoch": 0.07497369607695777, "flos": 17822071405440.0, "grad_norm": 2.675073323546491, "language_loss": 0.8825295, "learning_rate": 3.9788210371357945e-06, "loss": 0.90542769, "num_input_tokens_seen": 26515375, "step": 1247, "time_per_iteration": 2.6810224056243896 }, { "auxiliary_loss_clip": 0.0118495, "auxiliary_loss_mlp": 0.01069489, "balance_loss_clip": 1.06058884, "balance_loss_mlp": 1.04383492, "epoch": 0.07503381932962573, "flos": 15121086261120.0, "grad_norm": 2.620559853720615, "language_loss": 0.64849806, "learning_rate": 3.978764471530921e-06, "loss": 0.67104244, "num_input_tokens_seen": 26533595, "step": 1248, "time_per_iteration": 2.706862449645996 }, { "auxiliary_loss_clip": 0.01181878, "auxiliary_loss_mlp": 0.00782677, "balance_loss_clip": 1.0575974, "balance_loss_mlp": 1.0004611, "epoch": 0.0750939425822937, "flos": 12816734071680.0, "grad_norm": 2.872208543000993, "language_loss": 0.74216163, "learning_rate": 3.978707830891102e-06, "loss": 0.7618072, "num_input_tokens_seen": 26549405, "step": 1249, "time_per_iteration": 4.309665679931641 }, { "auxiliary_loss_clip": 0.01168375, "auxiliary_loss_mlp": 0.01079691, "balance_loss_clip": 1.0579834, "balance_loss_mlp": 1.05296445, "epoch": 0.07515406583496168, "flos": 24206844695040.0, "grad_norm": 2.679176110316805, "language_loss": 0.82353318, "learning_rate": 3.978651115218482e-06, "loss": 0.84601378, "num_input_tokens_seen": 26567200, "step": 1250, "time_per_iteration": 4.367432594299316 }, { "auxiliary_loss_clip": 0.011507, "auxiliary_loss_mlp": 0.01064103, "balance_loss_clip": 1.05736125, "balance_loss_mlp": 1.0380677, "epoch": 0.07521418908762964, "flos": 26688164215680.0, "grad_norm": 2.015636709873133, "language_loss": 0.6679548, "learning_rate": 3.978594324515215e-06, "loss": 0.69010288, "num_input_tokens_seen": 26586190, "step": 1251, "time_per_iteration": 4.339111089706421 }, { "auxiliary_loss_clip": 0.01061099, "auxiliary_loss_mlp": 0.01007289, "balance_loss_clip": 1.02992618, "balance_loss_mlp": 1.00314093, "epoch": 0.0752743123402976, "flos": 59095140589440.0, "grad_norm": 0.9014655793512963, "language_loss": 0.7038399, "learning_rate": 3.9785374587834515e-06, "loss": 0.72452378, "num_input_tokens_seen": 26650710, "step": 1252, "time_per_iteration": 4.984445333480835 }, { "auxiliary_loss_clip": 0.0120348, "auxiliary_loss_mlp": 0.01071343, "balance_loss_clip": 1.06016684, "balance_loss_mlp": 1.04651129, "epoch": 0.07533443559296558, "flos": 23477032160640.0, "grad_norm": 2.2789224049077226, "language_loss": 0.79936707, "learning_rate": 3.97848051802535e-06, "loss": 0.82211524, "num_input_tokens_seen": 26669000, "step": 1253, "time_per_iteration": 2.613696575164795 }, { "auxiliary_loss_clip": 0.01165402, "auxiliary_loss_mlp": 0.01062493, "balance_loss_clip": 1.05703712, "balance_loss_mlp": 1.03758967, "epoch": 0.07539455884563355, "flos": 20879110114560.0, "grad_norm": 3.1057458778243263, "language_loss": 0.93360364, "learning_rate": 3.978423502243069e-06, "loss": 0.95588255, "num_input_tokens_seen": 26683075, "step": 1254, "time_per_iteration": 2.7332606315612793 }, { "auxiliary_loss_clip": 0.011733, "auxiliary_loss_mlp": 0.01064454, "balance_loss_clip": 1.06050682, "balance_loss_mlp": 1.03958726, "epoch": 0.07545468209830151, "flos": 27672906551040.0, "grad_norm": 2.090631066181037, "language_loss": 0.88087487, "learning_rate": 3.97836641143877e-06, "loss": 0.90325236, "num_input_tokens_seen": 26701875, "step": 1255, "time_per_iteration": 2.713636875152588 }, { "auxiliary_loss_clip": 0.01202338, "auxiliary_loss_mlp": 0.01071467, "balance_loss_clip": 1.06138325, "balance_loss_mlp": 1.04531264, "epoch": 0.0755148053509695, "flos": 14136990370560.0, "grad_norm": 1.9772348994273161, "language_loss": 0.79305708, "learning_rate": 3.978309245614618e-06, "loss": 0.81579506, "num_input_tokens_seen": 26719050, "step": 1256, "time_per_iteration": 2.688812255859375 }, { "auxiliary_loss_clip": 0.01064506, "auxiliary_loss_mlp": 0.01008663, "balance_loss_clip": 1.0281384, "balance_loss_mlp": 1.0043, "epoch": 0.07557492860363746, "flos": 58235257929600.0, "grad_norm": 0.7721513084275832, "language_loss": 0.58031851, "learning_rate": 3.9782520047727825e-06, "loss": 0.6010502, "num_input_tokens_seen": 26780650, "step": 1257, "time_per_iteration": 3.290971517562866 }, { "auxiliary_loss_clip": 0.01154091, "auxiliary_loss_mlp": 0.01065293, "balance_loss_clip": 1.06175375, "balance_loss_mlp": 1.04035461, "epoch": 0.07563505185630542, "flos": 24644380262400.0, "grad_norm": 2.5700283098608026, "language_loss": 0.90029764, "learning_rate": 3.978194688915432e-06, "loss": 0.92249143, "num_input_tokens_seen": 26798725, "step": 1258, "time_per_iteration": 2.800297975540161 }, { "auxiliary_loss_clip": 0.01169581, "auxiliary_loss_mlp": 0.01064585, "balance_loss_clip": 1.06184185, "balance_loss_mlp": 1.03797793, "epoch": 0.07569517510897339, "flos": 15522998515200.0, "grad_norm": 2.1868972302346377, "language_loss": 0.81404132, "learning_rate": 3.978137298044741e-06, "loss": 0.83638299, "num_input_tokens_seen": 26817005, "step": 1259, "time_per_iteration": 2.767717123031616 }, { "auxiliary_loss_clip": 0.01194891, "auxiliary_loss_mlp": 0.01062022, "balance_loss_clip": 1.06317782, "balance_loss_mlp": 1.03766739, "epoch": 0.07575529836164137, "flos": 22928532503040.0, "grad_norm": 1.8876128491153832, "language_loss": 0.7609086, "learning_rate": 3.978079832162885e-06, "loss": 0.78347778, "num_input_tokens_seen": 26836655, "step": 1260, "time_per_iteration": 2.859339714050293 }, { "auxiliary_loss_clip": 0.01160098, "auxiliary_loss_mlp": 0.01068568, "balance_loss_clip": 1.05432057, "balance_loss_mlp": 1.04222322, "epoch": 0.07581542161430933, "flos": 19500428344320.0, "grad_norm": 1.7028037437197219, "language_loss": 0.84734851, "learning_rate": 3.978022291272044e-06, "loss": 0.86963522, "num_input_tokens_seen": 26854925, "step": 1261, "time_per_iteration": 2.773087978363037 }, { "auxiliary_loss_clip": 0.01212087, "auxiliary_loss_mlp": 0.0106726, "balance_loss_clip": 1.06821966, "balance_loss_mlp": 1.04273915, "epoch": 0.0758755448669773, "flos": 24973465691520.0, "grad_norm": 1.8668314773439494, "language_loss": 0.82578814, "learning_rate": 3.977964675374399e-06, "loss": 0.84858155, "num_input_tokens_seen": 26876170, "step": 1262, "time_per_iteration": 2.681764841079712 }, { "auxiliary_loss_clip": 0.01206367, "auxiliary_loss_mlp": 0.0106285, "balance_loss_clip": 1.06333947, "balance_loss_mlp": 1.03685009, "epoch": 0.07593566811964528, "flos": 22747973811840.0, "grad_norm": 2.501362251414687, "language_loss": 0.82448232, "learning_rate": 3.977906984472136e-06, "loss": 0.84717447, "num_input_tokens_seen": 26895005, "step": 1263, "time_per_iteration": 2.6262786388397217 }, { "auxiliary_loss_clip": 0.01166059, "auxiliary_loss_mlp": 0.01068738, "balance_loss_clip": 1.06484997, "balance_loss_mlp": 1.04334641, "epoch": 0.07599579137231324, "flos": 23112395245440.0, "grad_norm": 2.171520639750579, "language_loss": 0.76149648, "learning_rate": 3.977849218567442e-06, "loss": 0.78384447, "num_input_tokens_seen": 26913930, "step": 1264, "time_per_iteration": 2.7735466957092285 }, { "auxiliary_loss_clip": 0.01181777, "auxiliary_loss_mlp": 0.01061673, "balance_loss_clip": 1.06183577, "balance_loss_mlp": 1.03704381, "epoch": 0.07605591462498121, "flos": 14502058248960.0, "grad_norm": 2.252731793921747, "language_loss": 0.80919051, "learning_rate": 3.977791377662507e-06, "loss": 0.83162498, "num_input_tokens_seen": 26931485, "step": 1265, "time_per_iteration": 2.6076793670654297 }, { "auxiliary_loss_clip": 0.01143593, "auxiliary_loss_mlp": 0.01068856, "balance_loss_clip": 1.05383801, "balance_loss_mlp": 1.0411638, "epoch": 0.07611603787764919, "flos": 23514199758720.0, "grad_norm": 2.117217065332582, "language_loss": 0.65244937, "learning_rate": 3.977733461759524e-06, "loss": 0.67457378, "num_input_tokens_seen": 26951670, "step": 1266, "time_per_iteration": 2.714848041534424 }, { "auxiliary_loss_clip": 0.0116364, "auxiliary_loss_mlp": 0.01066982, "balance_loss_clip": 1.05869627, "balance_loss_mlp": 1.04194832, "epoch": 0.07617616113031715, "flos": 21507188353920.0, "grad_norm": 2.0157381540709416, "language_loss": 0.79570109, "learning_rate": 3.977675470860691e-06, "loss": 0.81800735, "num_input_tokens_seen": 26970335, "step": 1267, "time_per_iteration": 2.692220687866211 }, { "auxiliary_loss_clip": 0.01186526, "auxiliary_loss_mlp": 0.01060572, "balance_loss_clip": 1.06368709, "balance_loss_mlp": 1.03644359, "epoch": 0.07623628438298512, "flos": 14573161221120.0, "grad_norm": 2.573855585409162, "language_loss": 0.72936547, "learning_rate": 3.977617404968205e-06, "loss": 0.75183642, "num_input_tokens_seen": 26986025, "step": 1268, "time_per_iteration": 2.666487216949463 }, { "auxiliary_loss_clip": 0.01189272, "auxiliary_loss_mlp": 0.01056943, "balance_loss_clip": 1.05925119, "balance_loss_mlp": 1.03146791, "epoch": 0.07629640763565308, "flos": 14720395069440.0, "grad_norm": 2.3531002902867018, "language_loss": 0.82087409, "learning_rate": 3.977559264084269e-06, "loss": 0.84333622, "num_input_tokens_seen": 27004045, "step": 1269, "time_per_iteration": 2.6196024417877197 }, { "auxiliary_loss_clip": 0.01198264, "auxiliary_loss_mlp": 0.01062408, "balance_loss_clip": 1.06528163, "balance_loss_mlp": 1.03656352, "epoch": 0.07635653088832106, "flos": 14902929008640.0, "grad_norm": 2.6660741307472424, "language_loss": 0.88614184, "learning_rate": 3.977501048211088e-06, "loss": 0.90874851, "num_input_tokens_seen": 27022070, "step": 1270, "time_per_iteration": 2.6423919200897217 }, { "auxiliary_loss_clip": 0.01195764, "auxiliary_loss_mlp": 0.01062092, "balance_loss_clip": 1.06443572, "balance_loss_mlp": 1.0371294, "epoch": 0.07641665414098903, "flos": 26651571235200.0, "grad_norm": 2.486841045046768, "language_loss": 0.7104162, "learning_rate": 3.977442757350869e-06, "loss": 0.73299474, "num_input_tokens_seen": 27041755, "step": 1271, "time_per_iteration": 2.6679437160491943 }, { "auxiliary_loss_clip": 0.01157818, "auxiliary_loss_mlp": 0.01068131, "balance_loss_clip": 1.05973268, "balance_loss_mlp": 1.04282308, "epoch": 0.07647677739365699, "flos": 25192808092800.0, "grad_norm": 1.5691807400142836, "language_loss": 0.82570392, "learning_rate": 3.977384391505823e-06, "loss": 0.84796339, "num_input_tokens_seen": 27061540, "step": 1272, "time_per_iteration": 2.7613680362701416 }, { "auxiliary_loss_clip": 0.01176176, "auxiliary_loss_mlp": 0.00782751, "balance_loss_clip": 1.05822372, "balance_loss_mlp": 1.00051665, "epoch": 0.07653690064632497, "flos": 20558141159040.0, "grad_norm": 1.811509476700225, "language_loss": 0.79854733, "learning_rate": 3.977325950678162e-06, "loss": 0.81813657, "num_input_tokens_seen": 27081395, "step": 1273, "time_per_iteration": 2.696317434310913 }, { "auxiliary_loss_clip": 0.01185133, "auxiliary_loss_mlp": 0.01064308, "balance_loss_clip": 1.06556833, "balance_loss_mlp": 1.03910685, "epoch": 0.07659702389899294, "flos": 22269320150400.0, "grad_norm": 1.7399681078894738, "language_loss": 0.81519866, "learning_rate": 3.977267434870103e-06, "loss": 0.83769304, "num_input_tokens_seen": 27101175, "step": 1274, "time_per_iteration": 2.8570950031280518 }, { "auxiliary_loss_clip": 0.0118748, "auxiliary_loss_mlp": 0.01078696, "balance_loss_clip": 1.06516898, "balance_loss_mlp": 1.05164731, "epoch": 0.0766571471516609, "flos": 32636120209920.0, "grad_norm": 2.6845981005996453, "language_loss": 0.73083639, "learning_rate": 3.977208844083865e-06, "loss": 0.75349814, "num_input_tokens_seen": 27124505, "step": 1275, "time_per_iteration": 2.75947904586792 }, { "auxiliary_loss_clip": 0.0121081, "auxiliary_loss_mlp": 0.01063745, "balance_loss_clip": 1.06740415, "balance_loss_mlp": 1.03694642, "epoch": 0.07671727040432888, "flos": 15267386355840.0, "grad_norm": 2.828157953752124, "language_loss": 0.79507053, "learning_rate": 3.9771501783216685e-06, "loss": 0.81781602, "num_input_tokens_seen": 27140960, "step": 1276, "time_per_iteration": 2.626683473587036 }, { "auxiliary_loss_clip": 0.01198279, "auxiliary_loss_mlp": 0.01058719, "balance_loss_clip": 1.06486118, "balance_loss_mlp": 1.03485298, "epoch": 0.07677739365699685, "flos": 28184094956160.0, "grad_norm": 2.406514987231471, "language_loss": 0.58915478, "learning_rate": 3.97709143758574e-06, "loss": 0.61172473, "num_input_tokens_seen": 27160985, "step": 1277, "time_per_iteration": 2.6684958934783936 }, { "auxiliary_loss_clip": 0.01201282, "auxiliary_loss_mlp": 0.01064396, "balance_loss_clip": 1.06430948, "balance_loss_mlp": 1.03919542, "epoch": 0.07683751690966481, "flos": 18296128126080.0, "grad_norm": 2.8024245322836046, "language_loss": 0.74957907, "learning_rate": 3.977032621878305e-06, "loss": 0.77223587, "num_input_tokens_seen": 27178390, "step": 1278, "time_per_iteration": 2.723675012588501 }, { "auxiliary_loss_clip": 0.01160972, "auxiliary_loss_mlp": 0.01063133, "balance_loss_clip": 1.0584681, "balance_loss_mlp": 1.0390408, "epoch": 0.07689764016233278, "flos": 21981101420160.0, "grad_norm": 5.339853944094037, "language_loss": 0.88594604, "learning_rate": 3.976973731201596e-06, "loss": 0.90818715, "num_input_tokens_seen": 27197505, "step": 1279, "time_per_iteration": 2.655036211013794 }, { "auxiliary_loss_clip": 0.01172627, "auxiliary_loss_mlp": 0.01066586, "balance_loss_clip": 1.06065845, "balance_loss_mlp": 1.04077685, "epoch": 0.07695776341500075, "flos": 22235995307520.0, "grad_norm": 2.4937131241937256, "language_loss": 0.8300451, "learning_rate": 3.976914765557845e-06, "loss": 0.85243726, "num_input_tokens_seen": 27214260, "step": 1280, "time_per_iteration": 2.7717065811157227 }, { "auxiliary_loss_clip": 0.01194022, "auxiliary_loss_mlp": 0.01066533, "balance_loss_clip": 1.06593037, "balance_loss_mlp": 1.04104638, "epoch": 0.07701788666766872, "flos": 16143750380160.0, "grad_norm": 2.044864943195716, "language_loss": 0.7581439, "learning_rate": 3.9768557249492875e-06, "loss": 0.78074944, "num_input_tokens_seen": 27232525, "step": 1281, "time_per_iteration": 2.7444865703582764 }, { "auxiliary_loss_clip": 0.01170775, "auxiliary_loss_mlp": 0.01062526, "balance_loss_clip": 1.05879402, "balance_loss_mlp": 1.03669322, "epoch": 0.07707800992033668, "flos": 19463045264640.0, "grad_norm": 1.8925477349429178, "language_loss": 0.75091648, "learning_rate": 3.9767966093781634e-06, "loss": 0.77324951, "num_input_tokens_seen": 27249800, "step": 1282, "time_per_iteration": 2.829145908355713 }, { "auxiliary_loss_clip": 0.01213222, "auxiliary_loss_mlp": 0.01071082, "balance_loss_clip": 1.07007408, "balance_loss_mlp": 1.04549992, "epoch": 0.07713813317300466, "flos": 18990281433600.0, "grad_norm": 2.1558853998977527, "language_loss": 0.83863324, "learning_rate": 3.976737418846713e-06, "loss": 0.8614763, "num_input_tokens_seen": 27268895, "step": 1283, "time_per_iteration": 2.6955173015594482 }, { "auxiliary_loss_clip": 0.0119621, "auxiliary_loss_mlp": 0.01066889, "balance_loss_clip": 1.06603825, "balance_loss_mlp": 1.03925657, "epoch": 0.07719825642567263, "flos": 18113953322880.0, "grad_norm": 2.520477290704422, "language_loss": 0.75147104, "learning_rate": 3.976678153357181e-06, "loss": 0.77410209, "num_input_tokens_seen": 27288180, "step": 1284, "time_per_iteration": 2.6589291095733643 }, { "auxiliary_loss_clip": 0.01182212, "auxiliary_loss_mlp": 0.01068485, "balance_loss_clip": 1.06304765, "balance_loss_mlp": 1.0438329, "epoch": 0.0772583796783406, "flos": 42194426993280.0, "grad_norm": 5.2953301239297295, "language_loss": 0.76224041, "learning_rate": 3.976618812911817e-06, "loss": 0.78474742, "num_input_tokens_seen": 27311815, "step": 1285, "time_per_iteration": 2.847702741622925 }, { "auxiliary_loss_clip": 0.01216302, "auxiliary_loss_mlp": 0.01071451, "balance_loss_clip": 1.07193899, "balance_loss_mlp": 1.04729891, "epoch": 0.07731850293100857, "flos": 24753692327040.0, "grad_norm": 2.0564733507641, "language_loss": 0.84193194, "learning_rate": 3.9765593975128685e-06, "loss": 0.86480945, "num_input_tokens_seen": 27331890, "step": 1286, "time_per_iteration": 2.713963270187378 }, { "auxiliary_loss_clip": 0.01180469, "auxiliary_loss_mlp": 0.01061062, "balance_loss_clip": 1.06331325, "balance_loss_mlp": 1.03646958, "epoch": 0.07737862618367654, "flos": 17565884628480.0, "grad_norm": 2.810253293244863, "language_loss": 0.76899689, "learning_rate": 3.97649990716259e-06, "loss": 0.79141217, "num_input_tokens_seen": 27348320, "step": 1287, "time_per_iteration": 2.669168472290039 }, { "auxiliary_loss_clip": 0.011763, "auxiliary_loss_mlp": 0.01061108, "balance_loss_clip": 1.05891848, "balance_loss_mlp": 1.03696775, "epoch": 0.0774387494363445, "flos": 25627147349760.0, "grad_norm": 1.6525652726351308, "language_loss": 0.84699571, "learning_rate": 3.976440341863237e-06, "loss": 0.86936986, "num_input_tokens_seen": 27367670, "step": 1288, "time_per_iteration": 2.7794599533081055 }, { "auxiliary_loss_clip": 0.01206182, "auxiliary_loss_mlp": 0.0106604, "balance_loss_clip": 1.06214797, "balance_loss_mlp": 1.04203176, "epoch": 0.07749887268901248, "flos": 12239865648000.0, "grad_norm": 2.0424090794957523, "language_loss": 0.85576034, "learning_rate": 3.976380701617068e-06, "loss": 0.87848258, "num_input_tokens_seen": 27385485, "step": 1289, "time_per_iteration": 4.232934236526489 }, { "auxiliary_loss_clip": 0.01207527, "auxiliary_loss_mlp": 0.01052975, "balance_loss_clip": 1.06487668, "balance_loss_mlp": 1.0291574, "epoch": 0.07755899594168045, "flos": 25081736261760.0, "grad_norm": 2.840721047922519, "language_loss": 0.85548425, "learning_rate": 3.976320986426344e-06, "loss": 0.87808931, "num_input_tokens_seen": 27405110, "step": 1290, "time_per_iteration": 4.218302965164185 }, { "auxiliary_loss_clip": 0.0117374, "auxiliary_loss_mlp": 0.01066698, "balance_loss_clip": 1.06411862, "balance_loss_mlp": 1.04041266, "epoch": 0.07761911919434841, "flos": 14246410176000.0, "grad_norm": 2.3756178078405976, "language_loss": 0.91390574, "learning_rate": 3.9762611962933315e-06, "loss": 0.93631011, "num_input_tokens_seen": 27422855, "step": 1291, "time_per_iteration": 4.468304395675659 }, { "auxiliary_loss_clip": 0.01081301, "auxiliary_loss_mlp": 0.01043026, "balance_loss_clip": 1.04092944, "balance_loss_mlp": 1.03894901, "epoch": 0.07767924244701638, "flos": 67237202954880.0, "grad_norm": 0.8973948861970446, "language_loss": 0.65065891, "learning_rate": 3.9762013312202955e-06, "loss": 0.67190224, "num_input_tokens_seen": 27487190, "step": 1292, "time_per_iteration": 3.3142755031585693 }, { "auxiliary_loss_clip": 0.01195822, "auxiliary_loss_mlp": 0.01062751, "balance_loss_clip": 1.06527543, "balance_loss_mlp": 1.03846776, "epoch": 0.07773936569968436, "flos": 28550635292160.0, "grad_norm": 1.7595227960044768, "language_loss": 0.87530363, "learning_rate": 3.9761413912095075e-06, "loss": 0.89788938, "num_input_tokens_seen": 27510465, "step": 1293, "time_per_iteration": 2.801603078842163 }, { "auxiliary_loss_clip": 0.01116633, "auxiliary_loss_mlp": 0.01078659, "balance_loss_clip": 1.05041039, "balance_loss_mlp": 1.05012059, "epoch": 0.07779948895235232, "flos": 27490264871040.0, "grad_norm": 2.2898991349098528, "language_loss": 0.84518278, "learning_rate": 3.976081376263239e-06, "loss": 0.8671357, "num_input_tokens_seen": 27528645, "step": 1294, "time_per_iteration": 2.898597002029419 }, { "auxiliary_loss_clip": 0.01158796, "auxiliary_loss_mlp": 0.01059505, "balance_loss_clip": 1.05967593, "balance_loss_mlp": 1.0342207, "epoch": 0.07785961220502029, "flos": 18223301301120.0, "grad_norm": 2.7292442592472073, "language_loss": 0.79365373, "learning_rate": 3.976021286383768e-06, "loss": 0.81583679, "num_input_tokens_seen": 27546165, "step": 1295, "time_per_iteration": 2.8481552600860596 }, { "auxiliary_loss_clip": 0.01155886, "auxiliary_loss_mlp": 0.01061351, "balance_loss_clip": 1.06015158, "balance_loss_mlp": 1.0356493, "epoch": 0.07791973545768827, "flos": 24608218245120.0, "grad_norm": 3.472740252224496, "language_loss": 0.88351864, "learning_rate": 3.975961121573371e-06, "loss": 0.90569103, "num_input_tokens_seen": 27566520, "step": 1296, "time_per_iteration": 2.697831392288208 }, { "auxiliary_loss_clip": 0.0120756, "auxiliary_loss_mlp": 0.01074146, "balance_loss_clip": 1.06552935, "balance_loss_mlp": 1.04791999, "epoch": 0.07797985871035623, "flos": 14282069402880.0, "grad_norm": 2.384603846473911, "language_loss": 0.9625901, "learning_rate": 3.9759008818343305e-06, "loss": 0.98540717, "num_input_tokens_seen": 27581960, "step": 1297, "time_per_iteration": 2.62660551071167 }, { "auxiliary_loss_clip": 0.01175852, "auxiliary_loss_mlp": 0.01069298, "balance_loss_clip": 1.06147313, "balance_loss_mlp": 1.04517019, "epoch": 0.0780399819630242, "flos": 26610453141120.0, "grad_norm": 2.15152040651991, "language_loss": 0.7600193, "learning_rate": 3.97584056716893e-06, "loss": 0.78247076, "num_input_tokens_seen": 27601415, "step": 1298, "time_per_iteration": 2.8040499687194824 }, { "auxiliary_loss_clip": 0.0114505, "auxiliary_loss_mlp": 0.00783981, "balance_loss_clip": 1.05864501, "balance_loss_mlp": 1.0006063, "epoch": 0.07810010521569218, "flos": 21834514016640.0, "grad_norm": 1.6697657327886877, "language_loss": 0.8097105, "learning_rate": 3.9757801775794575e-06, "loss": 0.82900077, "num_input_tokens_seen": 27621490, "step": 1299, "time_per_iteration": 2.7667653560638428 }, { "auxiliary_loss_clip": 0.01162638, "auxiliary_loss_mlp": 0.01064395, "balance_loss_clip": 1.06191885, "balance_loss_mlp": 1.0393368, "epoch": 0.07816022846836014, "flos": 25081233471360.0, "grad_norm": 1.9748762517467437, "language_loss": 0.86755943, "learning_rate": 3.975719713068202e-06, "loss": 0.8898297, "num_input_tokens_seen": 27640600, "step": 1300, "time_per_iteration": 2.7819204330444336 }, { "auxiliary_loss_clip": 0.0120807, "auxiliary_loss_mlp": 0.01056805, "balance_loss_clip": 1.06663537, "balance_loss_mlp": 1.03180683, "epoch": 0.0782203517210281, "flos": 40917515431680.0, "grad_norm": 3.040560411644486, "language_loss": 0.71822268, "learning_rate": 3.975659173637458e-06, "loss": 0.74087137, "num_input_tokens_seen": 27663070, "step": 1301, "time_per_iteration": 2.845107316970825 }, { "auxiliary_loss_clip": 0.01196566, "auxiliary_loss_mlp": 0.01075534, "balance_loss_clip": 1.06426311, "balance_loss_mlp": 1.05100083, "epoch": 0.07828047497369607, "flos": 41172014269440.0, "grad_norm": 1.6425838754876312, "language_loss": 0.70782864, "learning_rate": 3.97559855928952e-06, "loss": 0.73054957, "num_input_tokens_seen": 27686425, "step": 1302, "time_per_iteration": 2.898069381713867 }, { "auxiliary_loss_clip": 0.01162032, "auxiliary_loss_mlp": 0.00783256, "balance_loss_clip": 1.06019354, "balance_loss_mlp": 1.00062823, "epoch": 0.07834059822636405, "flos": 23508130360320.0, "grad_norm": 2.067506704059933, "language_loss": 0.82100385, "learning_rate": 3.9755378700266864e-06, "loss": 0.84045678, "num_input_tokens_seen": 27704900, "step": 1303, "time_per_iteration": 2.7862839698791504 }, { "auxiliary_loss_clip": 0.01191742, "auxiliary_loss_mlp": 0.01074585, "balance_loss_clip": 1.06583321, "balance_loss_mlp": 1.04908574, "epoch": 0.07840072147903202, "flos": 20193899293440.0, "grad_norm": 1.8830773419754625, "language_loss": 0.75206572, "learning_rate": 3.9754771058512585e-06, "loss": 0.77472901, "num_input_tokens_seen": 27724890, "step": 1304, "time_per_iteration": 2.7380170822143555 }, { "auxiliary_loss_clip": 0.01211207, "auxiliary_loss_mlp": 0.01074343, "balance_loss_clip": 1.07114935, "balance_loss_mlp": 1.04922605, "epoch": 0.07846084473169998, "flos": 21360816432000.0, "grad_norm": 1.6118444643214749, "language_loss": 0.76141047, "learning_rate": 3.975416266765542e-06, "loss": 0.784266, "num_input_tokens_seen": 27743115, "step": 1305, "time_per_iteration": 2.6788928508758545 }, { "auxiliary_loss_clip": 0.01137547, "auxiliary_loss_mlp": 0.01064795, "balance_loss_clip": 1.05611205, "balance_loss_mlp": 1.04021358, "epoch": 0.07852096798436796, "flos": 25410965345280.0, "grad_norm": 1.9541638070229452, "language_loss": 0.85011744, "learning_rate": 3.975355352771841e-06, "loss": 0.87214082, "num_input_tokens_seen": 27763570, "step": 1306, "time_per_iteration": 3.048137903213501 }, { "auxiliary_loss_clip": 0.01194779, "auxiliary_loss_mlp": 0.01049822, "balance_loss_clip": 1.06754708, "balance_loss_mlp": 1.02668333, "epoch": 0.07858109123703592, "flos": 24571481610240.0, "grad_norm": 6.108459548145404, "language_loss": 0.90882134, "learning_rate": 3.975294363872468e-06, "loss": 0.93126732, "num_input_tokens_seen": 27780030, "step": 1307, "time_per_iteration": 3.1597135066986084 }, { "auxiliary_loss_clip": 0.01145989, "auxiliary_loss_mlp": 0.01060478, "balance_loss_clip": 1.05529833, "balance_loss_mlp": 1.034729, "epoch": 0.07864121448970389, "flos": 20698874645760.0, "grad_norm": 3.4991416096159136, "language_loss": 0.83695096, "learning_rate": 3.975233300069735e-06, "loss": 0.85901558, "num_input_tokens_seen": 27796225, "step": 1308, "time_per_iteration": 2.749174118041992 }, { "auxiliary_loss_clip": 0.01151044, "auxiliary_loss_mlp": 0.01061966, "balance_loss_clip": 1.05445218, "balance_loss_mlp": 1.03789735, "epoch": 0.07870133774237187, "flos": 22966526113920.0, "grad_norm": 1.7092634116882437, "language_loss": 0.77521002, "learning_rate": 3.975172161365958e-06, "loss": 0.7973401, "num_input_tokens_seen": 27815975, "step": 1309, "time_per_iteration": 2.752854108810425 }, { "auxiliary_loss_clip": 0.01200102, "auxiliary_loss_mlp": 0.01070583, "balance_loss_clip": 1.06396675, "balance_loss_mlp": 1.04449987, "epoch": 0.07876146099503983, "flos": 18842832103680.0, "grad_norm": 1.8729662604656268, "language_loss": 0.80561006, "learning_rate": 3.975110947763453e-06, "loss": 0.82831693, "num_input_tokens_seen": 27832255, "step": 1310, "time_per_iteration": 2.6966710090637207 }, { "auxiliary_loss_clip": 0.01173381, "auxiliary_loss_mlp": 0.0078245, "balance_loss_clip": 1.06193507, "balance_loss_mlp": 1.00060987, "epoch": 0.0788215842477078, "flos": 23805794367360.0, "grad_norm": 1.796715978968241, "language_loss": 0.73187977, "learning_rate": 3.9750496592645435e-06, "loss": 0.75143808, "num_input_tokens_seen": 27852180, "step": 1311, "time_per_iteration": 2.7588090896606445 }, { "auxiliary_loss_clip": 0.01188438, "auxiliary_loss_mlp": 0.01078546, "balance_loss_clip": 1.06358969, "balance_loss_mlp": 1.05342865, "epoch": 0.07888170750037576, "flos": 21579907438080.0, "grad_norm": 1.7490617386556226, "language_loss": 0.86002982, "learning_rate": 3.974988295871553e-06, "loss": 0.88269973, "num_input_tokens_seen": 27871435, "step": 1312, "time_per_iteration": 2.6969683170318604 }, { "auxiliary_loss_clip": 0.01178338, "auxiliary_loss_mlp": 0.01059112, "balance_loss_clip": 1.06324685, "balance_loss_mlp": 1.03633142, "epoch": 0.07894183075304374, "flos": 19864849777920.0, "grad_norm": 1.825664315845032, "language_loss": 0.82087892, "learning_rate": 3.9749268575868085e-06, "loss": 0.84325337, "num_input_tokens_seen": 27890625, "step": 1313, "time_per_iteration": 2.6936304569244385 }, { "auxiliary_loss_clip": 0.01184798, "auxiliary_loss_mlp": 0.00783631, "balance_loss_clip": 1.06229842, "balance_loss_mlp": 1.00053823, "epoch": 0.07900195400571171, "flos": 16143463071360.0, "grad_norm": 2.837190319075622, "language_loss": 0.73569417, "learning_rate": 3.97486534441264e-06, "loss": 0.75537837, "num_input_tokens_seen": 27906530, "step": 1314, "time_per_iteration": 2.653505325317383 }, { "auxiliary_loss_clip": 0.01154585, "auxiliary_loss_mlp": 0.00782352, "balance_loss_clip": 1.05730104, "balance_loss_mlp": 1.00044668, "epoch": 0.07906207725837967, "flos": 23730417676800.0, "grad_norm": 1.6153694611764058, "language_loss": 0.79490477, "learning_rate": 3.974803756351379e-06, "loss": 0.81427419, "num_input_tokens_seen": 27926725, "step": 1315, "time_per_iteration": 2.797306776046753 }, { "auxiliary_loss_clip": 0.01189107, "auxiliary_loss_mlp": 0.01060743, "balance_loss_clip": 1.05841756, "balance_loss_mlp": 1.03487444, "epoch": 0.07912220051104765, "flos": 24315905364480.0, "grad_norm": 1.6362349035659796, "language_loss": 0.73546493, "learning_rate": 3.974742093405362e-06, "loss": 0.75796348, "num_input_tokens_seen": 27947875, "step": 1316, "time_per_iteration": 2.688997507095337 }, { "auxiliary_loss_clip": 0.01162651, "auxiliary_loss_mlp": 0.01066617, "balance_loss_clip": 1.05845332, "balance_loss_mlp": 1.0418098, "epoch": 0.07918232376371562, "flos": 18880035615360.0, "grad_norm": 2.157376902111077, "language_loss": 0.65540409, "learning_rate": 3.974680355576927e-06, "loss": 0.67769682, "num_input_tokens_seen": 27965040, "step": 1317, "time_per_iteration": 2.6998519897460938 }, { "auxiliary_loss_clip": 0.01177674, "auxiliary_loss_mlp": 0.01068635, "balance_loss_clip": 1.06280386, "balance_loss_mlp": 1.0428021, "epoch": 0.07924244701638358, "flos": 27376284038400.0, "grad_norm": 2.382161374765057, "language_loss": 0.73105192, "learning_rate": 3.974618542868415e-06, "loss": 0.75351495, "num_input_tokens_seen": 27985330, "step": 1318, "time_per_iteration": 2.8350789546966553 }, { "auxiliary_loss_clip": 0.01139638, "auxiliary_loss_mlp": 0.01058798, "balance_loss_clip": 1.05582452, "balance_loss_mlp": 1.03515935, "epoch": 0.07930257026905156, "flos": 25120340403840.0, "grad_norm": 2.635941883481154, "language_loss": 0.90381306, "learning_rate": 3.97455665528217e-06, "loss": 0.92579746, "num_input_tokens_seen": 28007615, "step": 1319, "time_per_iteration": 2.8553895950317383 }, { "auxiliary_loss_clip": 0.01175059, "auxiliary_loss_mlp": 0.01055333, "balance_loss_clip": 1.05662942, "balance_loss_mlp": 1.03122926, "epoch": 0.07936269352171953, "flos": 21834478103040.0, "grad_norm": 1.9449065990449943, "language_loss": 0.80134505, "learning_rate": 3.974494692820539e-06, "loss": 0.82364893, "num_input_tokens_seen": 28027765, "step": 1320, "time_per_iteration": 2.6651997566223145 }, { "auxiliary_loss_clip": 0.01181808, "auxiliary_loss_mlp": 0.01060151, "balance_loss_clip": 1.06380332, "balance_loss_mlp": 1.03657198, "epoch": 0.07942281677438749, "flos": 16939889377920.0, "grad_norm": 2.1078540484546746, "language_loss": 0.6901226, "learning_rate": 3.974432655485872e-06, "loss": 0.71254218, "num_input_tokens_seen": 28044225, "step": 1321, "time_per_iteration": 2.6500401496887207 }, { "auxiliary_loss_clip": 0.01189002, "auxiliary_loss_mlp": 0.01060598, "balance_loss_clip": 1.06469131, "balance_loss_mlp": 1.03688753, "epoch": 0.07948294002705546, "flos": 18986941468800.0, "grad_norm": 1.9310950096267907, "language_loss": 0.8359012, "learning_rate": 3.9743705432805195e-06, "loss": 0.85839725, "num_input_tokens_seen": 28062915, "step": 1322, "time_per_iteration": 2.684978723526001 }, { "auxiliary_loss_clip": 0.01202147, "auxiliary_loss_mlp": 0.01057117, "balance_loss_clip": 1.06135976, "balance_loss_mlp": 1.03304851, "epoch": 0.07954306327972344, "flos": 21653452535040.0, "grad_norm": 2.128262121046283, "language_loss": 0.90555447, "learning_rate": 3.974308356206838e-06, "loss": 0.92814714, "num_input_tokens_seen": 28082175, "step": 1323, "time_per_iteration": 2.6192240715026855 }, { "auxiliary_loss_clip": 0.01164151, "auxiliary_loss_mlp": 0.01062303, "balance_loss_clip": 1.06272292, "balance_loss_mlp": 1.03809166, "epoch": 0.0796031865323914, "flos": 23220270766080.0, "grad_norm": 1.8373443631598505, "language_loss": 0.82521075, "learning_rate": 3.974246094267187e-06, "loss": 0.84747529, "num_input_tokens_seen": 28102645, "step": 1324, "time_per_iteration": 2.8283956050872803 }, { "auxiliary_loss_clip": 0.01180787, "auxiliary_loss_mlp": 0.01053463, "balance_loss_clip": 1.06256735, "balance_loss_mlp": 1.02834535, "epoch": 0.07966330978505937, "flos": 23294534135040.0, "grad_norm": 2.119290865165494, "language_loss": 0.79162025, "learning_rate": 3.974183757463925e-06, "loss": 0.8139627, "num_input_tokens_seen": 28122805, "step": 1325, "time_per_iteration": 2.6996092796325684 }, { "auxiliary_loss_clip": 0.01119286, "auxiliary_loss_mlp": 0.00785175, "balance_loss_clip": 1.04844928, "balance_loss_mlp": 1.00035501, "epoch": 0.07972343303772735, "flos": 18363783392640.0, "grad_norm": 2.2621745256944448, "language_loss": 0.88038248, "learning_rate": 3.974121345799418e-06, "loss": 0.89942712, "num_input_tokens_seen": 28140530, "step": 1326, "time_per_iteration": 2.881410837173462 }, { "auxiliary_loss_clip": 0.012, "auxiliary_loss_mlp": 0.01056877, "balance_loss_clip": 1.06257951, "balance_loss_mlp": 1.03168797, "epoch": 0.07978355629039531, "flos": 21762513204480.0, "grad_norm": 1.8538865301137586, "language_loss": 0.8328709, "learning_rate": 3.974058859276032e-06, "loss": 0.85543966, "num_input_tokens_seen": 28159640, "step": 1327, "time_per_iteration": 2.7277982234954834 }, { "auxiliary_loss_clip": 0.01207207, "auxiliary_loss_mlp": 0.01056886, "balance_loss_clip": 1.06532371, "balance_loss_mlp": 1.03223395, "epoch": 0.07984367954306328, "flos": 18551309322240.0, "grad_norm": 2.3216818645515636, "language_loss": 0.78599, "learning_rate": 3.9739962978961354e-06, "loss": 0.80863088, "num_input_tokens_seen": 28177050, "step": 1328, "time_per_iteration": 4.2137157917022705 }, { "auxiliary_loss_clip": 0.01201442, "auxiliary_loss_mlp": 0.01052053, "balance_loss_clip": 1.06778932, "balance_loss_mlp": 1.02722156, "epoch": 0.07990380279573125, "flos": 16904050583040.0, "grad_norm": 4.209530911932697, "language_loss": 0.73918134, "learning_rate": 3.973933661662101e-06, "loss": 0.76171625, "num_input_tokens_seen": 28193245, "step": 1329, "time_per_iteration": 5.853717565536499 }, { "auxiliary_loss_clip": 0.01169795, "auxiliary_loss_mlp": 0.01064631, "balance_loss_clip": 1.06039059, "balance_loss_mlp": 1.04069376, "epoch": 0.07996392604839922, "flos": 24098358643200.0, "grad_norm": 1.6102544328312476, "language_loss": 0.81743932, "learning_rate": 3.973870950576305e-06, "loss": 0.83978355, "num_input_tokens_seen": 28213570, "step": 1330, "time_per_iteration": 4.307915687561035 }, { "auxiliary_loss_clip": 0.01205148, "auxiliary_loss_mlp": 0.00780735, "balance_loss_clip": 1.06445098, "balance_loss_mlp": 1.00030971, "epoch": 0.08002404930106718, "flos": 14278729438080.0, "grad_norm": 3.0935981151455865, "language_loss": 0.88962448, "learning_rate": 3.9738081646411255e-06, "loss": 0.90948325, "num_input_tokens_seen": 28229980, "step": 1331, "time_per_iteration": 2.645198345184326 }, { "auxiliary_loss_clip": 0.01196019, "auxiliary_loss_mlp": 0.00781409, "balance_loss_clip": 1.05950165, "balance_loss_mlp": 1.00032377, "epoch": 0.08008417255373516, "flos": 40406219285760.0, "grad_norm": 1.8933982437719925, "language_loss": 0.7335732, "learning_rate": 3.973745303858942e-06, "loss": 0.75334752, "num_input_tokens_seen": 28253840, "step": 1332, "time_per_iteration": 2.792128562927246 }, { "auxiliary_loss_clip": 0.01180359, "auxiliary_loss_mlp": 0.01055118, "balance_loss_clip": 1.06217384, "balance_loss_mlp": 1.03216982, "epoch": 0.08014429580640313, "flos": 18478913460480.0, "grad_norm": 1.7464568676953767, "language_loss": 0.82765031, "learning_rate": 3.973682368232138e-06, "loss": 0.85000509, "num_input_tokens_seen": 28271675, "step": 1333, "time_per_iteration": 2.635579824447632 }, { "auxiliary_loss_clip": 0.01160554, "auxiliary_loss_mlp": 0.01059025, "balance_loss_clip": 1.05944169, "balance_loss_mlp": 1.03502798, "epoch": 0.0802044190590711, "flos": 22053461368320.0, "grad_norm": 2.677615191761892, "language_loss": 0.74862051, "learning_rate": 3.9736193577631015e-06, "loss": 0.77081633, "num_input_tokens_seen": 28291850, "step": 1334, "time_per_iteration": 2.8150298595428467 }, { "auxiliary_loss_clip": 0.01176175, "auxiliary_loss_mlp": 0.01063593, "balance_loss_clip": 1.06460369, "balance_loss_mlp": 1.04010868, "epoch": 0.08026454231173906, "flos": 24572128055040.0, "grad_norm": 1.8723728369534094, "language_loss": 0.79970533, "learning_rate": 3.973556272454221e-06, "loss": 0.82210302, "num_input_tokens_seen": 28310780, "step": 1335, "time_per_iteration": 2.6858503818511963 }, { "auxiliary_loss_clip": 0.01068232, "auxiliary_loss_mlp": 0.01020395, "balance_loss_clip": 1.04101062, "balance_loss_mlp": 1.01693749, "epoch": 0.08032466556440704, "flos": 52581841459200.0, "grad_norm": 0.7491611763509133, "language_loss": 0.56056821, "learning_rate": 3.973493112307889e-06, "loss": 0.58145452, "num_input_tokens_seen": 28369985, "step": 1336, "time_per_iteration": 3.324230670928955 }, { "auxiliary_loss_clip": 0.01179495, "auxiliary_loss_mlp": 0.01064433, "balance_loss_clip": 1.06005239, "balance_loss_mlp": 1.04149771, "epoch": 0.080384788817075, "flos": 23842602829440.0, "grad_norm": 2.8990759307469256, "language_loss": 0.67587668, "learning_rate": 3.9734298773265005e-06, "loss": 0.69831598, "num_input_tokens_seen": 28388670, "step": 1337, "time_per_iteration": 2.755451202392578 }, { "auxiliary_loss_clip": 0.01171763, "auxiliary_loss_mlp": 0.0107788, "balance_loss_clip": 1.06270492, "balance_loss_mlp": 1.05304837, "epoch": 0.08044491206974297, "flos": 25300719527040.0, "grad_norm": 1.9421039451316542, "language_loss": 0.86847901, "learning_rate": 3.973366567512453e-06, "loss": 0.89097536, "num_input_tokens_seen": 28411845, "step": 1338, "time_per_iteration": 2.758418560028076 }, { "auxiliary_loss_clip": 0.01136344, "auxiliary_loss_mlp": 0.01082295, "balance_loss_clip": 1.04883683, "balance_loss_mlp": 1.05596161, "epoch": 0.08050503532241095, "flos": 22376549226240.0, "grad_norm": 2.4557709650828157, "language_loss": 0.87217385, "learning_rate": 3.973303182868147e-06, "loss": 0.89436018, "num_input_tokens_seen": 28427875, "step": 1339, "time_per_iteration": 2.72682785987854 }, { "auxiliary_loss_clip": 0.01188632, "auxiliary_loss_mlp": 0.01055953, "balance_loss_clip": 1.06334567, "balance_loss_mlp": 1.03417385, "epoch": 0.08056515857507891, "flos": 18369421827840.0, "grad_norm": 10.603370056653041, "language_loss": 0.89504963, "learning_rate": 3.973239723395988e-06, "loss": 0.91749549, "num_input_tokens_seen": 28446615, "step": 1340, "time_per_iteration": 2.639601469039917 }, { "auxiliary_loss_clip": 0.01080107, "auxiliary_loss_mlp": 0.01012224, "balance_loss_clip": 1.02943289, "balance_loss_mlp": 1.00850451, "epoch": 0.08062528182774688, "flos": 51348130980480.0, "grad_norm": 0.8861598592181924, "language_loss": 0.64834231, "learning_rate": 3.97317618909838e-06, "loss": 0.66926563, "num_input_tokens_seen": 28505290, "step": 1341, "time_per_iteration": 3.0625648498535156 }, { "auxiliary_loss_clip": 0.01197538, "auxiliary_loss_mlp": 0.01061885, "balance_loss_clip": 1.0628854, "balance_loss_mlp": 1.0364095, "epoch": 0.08068540508041486, "flos": 17599712261760.0, "grad_norm": 3.3156125209451286, "language_loss": 0.89471233, "learning_rate": 3.973112579977733e-06, "loss": 0.9173066, "num_input_tokens_seen": 28522735, "step": 1342, "time_per_iteration": 2.6123783588409424 }, { "auxiliary_loss_clip": 0.01177687, "auxiliary_loss_mlp": 0.01062063, "balance_loss_clip": 1.0644995, "balance_loss_mlp": 1.03818512, "epoch": 0.08074552833308282, "flos": 10561185486720.0, "grad_norm": 2.2904075751929365, "language_loss": 0.76354575, "learning_rate": 3.973048896036459e-06, "loss": 0.78594327, "num_input_tokens_seen": 28539460, "step": 1343, "time_per_iteration": 2.7564918994903564 }, { "auxiliary_loss_clip": 0.01064182, "auxiliary_loss_mlp": 0.01010488, "balance_loss_clip": 1.02542567, "balance_loss_mlp": 1.0066731, "epoch": 0.08080565158575079, "flos": 60840254954880.0, "grad_norm": 0.8071281523255156, "language_loss": 0.57418531, "learning_rate": 3.972985137276974e-06, "loss": 0.59493202, "num_input_tokens_seen": 28599855, "step": 1344, "time_per_iteration": 3.170443058013916 }, { "auxiliary_loss_clip": 0.01158029, "auxiliary_loss_mlp": 0.01063108, "balance_loss_clip": 1.05839872, "balance_loss_mlp": 1.03846788, "epoch": 0.08086577483841875, "flos": 18332361970560.0, "grad_norm": 2.5953739346171676, "language_loss": 0.86569476, "learning_rate": 3.972921303701695e-06, "loss": 0.88790607, "num_input_tokens_seen": 28617585, "step": 1345, "time_per_iteration": 2.765254497528076 }, { "auxiliary_loss_clip": 0.01203428, "auxiliary_loss_mlp": 0.01057879, "balance_loss_clip": 1.06629944, "balance_loss_mlp": 1.03603959, "epoch": 0.08092589809108673, "flos": 21543601766400.0, "grad_norm": 1.8653844332842058, "language_loss": 0.87646407, "learning_rate": 3.972857395313042e-06, "loss": 0.89907712, "num_input_tokens_seen": 28636355, "step": 1346, "time_per_iteration": 2.655611991882324 }, { "auxiliary_loss_clip": 0.01191822, "auxiliary_loss_mlp": 0.0105414, "balance_loss_clip": 1.06450033, "balance_loss_mlp": 1.03047693, "epoch": 0.0809860213437547, "flos": 22128012046080.0, "grad_norm": 1.7047476553504466, "language_loss": 0.9298563, "learning_rate": 3.972793412113439e-06, "loss": 0.95231593, "num_input_tokens_seen": 28656260, "step": 1347, "time_per_iteration": 2.718355417251587 }, { "auxiliary_loss_clip": 0.01188696, "auxiliary_loss_mlp": 0.01066703, "balance_loss_clip": 1.06260633, "balance_loss_mlp": 1.04144263, "epoch": 0.08104614459642266, "flos": 21725489260800.0, "grad_norm": 1.9307860049130865, "language_loss": 0.89506733, "learning_rate": 3.972729354105312e-06, "loss": 0.91762137, "num_input_tokens_seen": 28675865, "step": 1348, "time_per_iteration": 2.763735771179199 }, { "auxiliary_loss_clip": 0.01137961, "auxiliary_loss_mlp": 0.01059733, "balance_loss_clip": 1.06026649, "balance_loss_mlp": 1.03730989, "epoch": 0.08110626784909064, "flos": 23951878980480.0, "grad_norm": 1.6214351378274148, "language_loss": 0.76906884, "learning_rate": 3.97266522129109e-06, "loss": 0.79104578, "num_input_tokens_seen": 28696255, "step": 1349, "time_per_iteration": 2.778050661087036 }, { "auxiliary_loss_clip": 0.01202122, "auxiliary_loss_mlp": 0.01065092, "balance_loss_clip": 1.06290889, "balance_loss_mlp": 1.04144049, "epoch": 0.0811663911017586, "flos": 19025689265280.0, "grad_norm": 1.777484449358279, "language_loss": 0.8877703, "learning_rate": 3.972601013673205e-06, "loss": 0.91044247, "num_input_tokens_seen": 28713905, "step": 1350, "time_per_iteration": 2.5871450901031494 }, { "auxiliary_loss_clip": 0.01164889, "auxiliary_loss_mlp": 0.00780958, "balance_loss_clip": 1.06011164, "balance_loss_mlp": 1.00028801, "epoch": 0.08122651435442657, "flos": 15341290588800.0, "grad_norm": 2.7472756845793156, "language_loss": 0.82298493, "learning_rate": 3.972536731254092e-06, "loss": 0.84244347, "num_input_tokens_seen": 28732075, "step": 1351, "time_per_iteration": 2.840271234512329 }, { "auxiliary_loss_clip": 0.01198177, "auxiliary_loss_mlp": 0.01055773, "balance_loss_clip": 1.06010592, "balance_loss_mlp": 1.03090644, "epoch": 0.08128663760709455, "flos": 23221563655680.0, "grad_norm": 2.2808101252466724, "language_loss": 0.75274944, "learning_rate": 3.972472374036189e-06, "loss": 0.775289, "num_input_tokens_seen": 28751150, "step": 1352, "time_per_iteration": 2.733644485473633 }, { "auxiliary_loss_clip": 0.01194643, "auxiliary_loss_mlp": 0.00783595, "balance_loss_clip": 1.06613326, "balance_loss_mlp": 1.00036311, "epoch": 0.08134676085976252, "flos": 22965628273920.0, "grad_norm": 1.678520960707938, "language_loss": 0.82936156, "learning_rate": 3.972407942021935e-06, "loss": 0.84914398, "num_input_tokens_seen": 28773360, "step": 1353, "time_per_iteration": 2.742149829864502 }, { "auxiliary_loss_clip": 0.01068236, "auxiliary_loss_mlp": 0.01015932, "balance_loss_clip": 1.02440155, "balance_loss_mlp": 1.01242769, "epoch": 0.08140688411243048, "flos": 64322115816960.0, "grad_norm": 0.8516312511934722, "language_loss": 0.59741521, "learning_rate": 3.972343435213775e-06, "loss": 0.61825693, "num_input_tokens_seen": 28833390, "step": 1354, "time_per_iteration": 3.1912426948547363 }, { "auxiliary_loss_clip": 0.01150343, "auxiliary_loss_mlp": 0.01058874, "balance_loss_clip": 1.0546236, "balance_loss_mlp": 1.03583086, "epoch": 0.08146700736509845, "flos": 22491858862080.0, "grad_norm": 2.1234068486581643, "language_loss": 0.82310611, "learning_rate": 3.972278853614154e-06, "loss": 0.84519827, "num_input_tokens_seen": 28852430, "step": 1355, "time_per_iteration": 2.782442808151245 }, { "auxiliary_loss_clip": 0.01186948, "auxiliary_loss_mlp": 0.01062856, "balance_loss_clip": 1.0600667, "balance_loss_mlp": 1.03801262, "epoch": 0.08152713061776642, "flos": 20447823513600.0, "grad_norm": 1.8366299277102565, "language_loss": 0.7135247, "learning_rate": 3.972214197225521e-06, "loss": 0.73602271, "num_input_tokens_seen": 28870685, "step": 1356, "time_per_iteration": 2.7777554988861084 }, { "auxiliary_loss_clip": 0.01194666, "auxiliary_loss_mlp": 0.01056522, "balance_loss_clip": 1.06462216, "balance_loss_mlp": 1.03259718, "epoch": 0.08158725387043439, "flos": 23550218121600.0, "grad_norm": 2.050923525150184, "language_loss": 0.70426142, "learning_rate": 3.972149466050329e-06, "loss": 0.72677326, "num_input_tokens_seen": 28889860, "step": 1357, "time_per_iteration": 2.852046012878418 }, { "auxiliary_loss_clip": 0.01186996, "auxiliary_loss_mlp": 0.01054475, "balance_loss_clip": 1.06138206, "balance_loss_mlp": 1.03070426, "epoch": 0.08164737712310235, "flos": 22017335264640.0, "grad_norm": 2.634204556872777, "language_loss": 0.84203482, "learning_rate": 3.97208466009103e-06, "loss": 0.8644495, "num_input_tokens_seen": 28905865, "step": 1358, "time_per_iteration": 2.7127115726470947 }, { "auxiliary_loss_clip": 0.01176629, "auxiliary_loss_mlp": 0.010566, "balance_loss_clip": 1.06037402, "balance_loss_mlp": 1.03154182, "epoch": 0.08170750037577033, "flos": 23367827836800.0, "grad_norm": 2.1726272773281097, "language_loss": 1.02781308, "learning_rate": 3.972019779350084e-06, "loss": 1.05014539, "num_input_tokens_seen": 28925250, "step": 1359, "time_per_iteration": 2.7171826362609863 }, { "auxiliary_loss_clip": 0.01128357, "auxiliary_loss_mlp": 0.01056774, "balance_loss_clip": 1.05009234, "balance_loss_mlp": 1.03263426, "epoch": 0.0817676236284383, "flos": 28397978490240.0, "grad_norm": 2.0494617207464945, "language_loss": 0.8313604, "learning_rate": 3.971954823829951e-06, "loss": 0.85321164, "num_input_tokens_seen": 28943445, "step": 1360, "time_per_iteration": 2.9020919799804688 }, { "auxiliary_loss_clip": 0.01202956, "auxiliary_loss_mlp": 0.0106887, "balance_loss_clip": 1.06274688, "balance_loss_mlp": 1.04469395, "epoch": 0.08182774688110626, "flos": 19208905562880.0, "grad_norm": 5.2377005088202075, "language_loss": 0.72322488, "learning_rate": 3.971889793533093e-06, "loss": 0.74594313, "num_input_tokens_seen": 28962695, "step": 1361, "time_per_iteration": 2.6643178462982178 }, { "auxiliary_loss_clip": 0.01166556, "auxiliary_loss_mlp": 0.01056311, "balance_loss_clip": 1.0552367, "balance_loss_mlp": 1.03184962, "epoch": 0.08188787013377424, "flos": 22784099915520.0, "grad_norm": 28.302545492028134, "language_loss": 0.76657653, "learning_rate": 3.971824688461976e-06, "loss": 0.78880513, "num_input_tokens_seen": 28982120, "step": 1362, "time_per_iteration": 2.7439064979553223 }, { "auxiliary_loss_clip": 0.01199728, "auxiliary_loss_mlp": 0.01053492, "balance_loss_clip": 1.06350708, "balance_loss_mlp": 1.03104496, "epoch": 0.08194799338644221, "flos": 16468095214080.0, "grad_norm": 2.1850191919210338, "language_loss": 0.72384715, "learning_rate": 3.971759508619069e-06, "loss": 0.74637932, "num_input_tokens_seen": 28998100, "step": 1363, "time_per_iteration": 2.7082791328430176 }, { "auxiliary_loss_clip": 0.01202887, "auxiliary_loss_mlp": 0.01066374, "balance_loss_clip": 1.06580126, "balance_loss_mlp": 1.04083955, "epoch": 0.08200811663911017, "flos": 23913633974400.0, "grad_norm": 2.142285699657122, "language_loss": 0.7726444, "learning_rate": 3.971694254006844e-06, "loss": 0.79533696, "num_input_tokens_seen": 29017095, "step": 1364, "time_per_iteration": 2.777156114578247 }, { "auxiliary_loss_clip": 0.01135428, "auxiliary_loss_mlp": 0.01063854, "balance_loss_clip": 1.05182433, "balance_loss_mlp": 1.03645968, "epoch": 0.08206823989177814, "flos": 17896550256000.0, "grad_norm": 1.85589982882842, "language_loss": 0.82242119, "learning_rate": 3.971628924627776e-06, "loss": 0.844414, "num_input_tokens_seen": 29037240, "step": 1365, "time_per_iteration": 2.8192803859710693 }, { "auxiliary_loss_clip": 0.01196582, "auxiliary_loss_mlp": 0.01059945, "balance_loss_clip": 1.07006347, "balance_loss_mlp": 1.03706884, "epoch": 0.08212836314444612, "flos": 22088186841600.0, "grad_norm": 1.7803424706125983, "language_loss": 0.82062519, "learning_rate": 3.97156352048434e-06, "loss": 0.84319043, "num_input_tokens_seen": 29056250, "step": 1366, "time_per_iteration": 2.7482311725616455 }, { "auxiliary_loss_clip": 0.01153262, "auxiliary_loss_mlp": 0.0107233, "balance_loss_clip": 1.05320215, "balance_loss_mlp": 1.04779685, "epoch": 0.08218848639711408, "flos": 17597485618560.0, "grad_norm": 2.010209091244133, "language_loss": 0.81944495, "learning_rate": 3.97149804157902e-06, "loss": 0.84170091, "num_input_tokens_seen": 29073380, "step": 1367, "time_per_iteration": 4.352729797363281 }, { "auxiliary_loss_clip": 0.01206125, "auxiliary_loss_mlp": 0.01066888, "balance_loss_clip": 1.06541765, "balance_loss_mlp": 1.04241478, "epoch": 0.08224860964978205, "flos": 17857838373120.0, "grad_norm": 2.518996379768439, "language_loss": 0.8331567, "learning_rate": 3.9714324879142946e-06, "loss": 0.85588682, "num_input_tokens_seen": 29091330, "step": 1368, "time_per_iteration": 6.077457666397095 }, { "auxiliary_loss_clip": 0.01159992, "auxiliary_loss_mlp": 0.01049874, "balance_loss_clip": 1.06314564, "balance_loss_mlp": 1.02790344, "epoch": 0.08230873290245003, "flos": 25227533566080.0, "grad_norm": 3.198110530618569, "language_loss": 0.81336468, "learning_rate": 3.971366859492653e-06, "loss": 0.8354634, "num_input_tokens_seen": 29110375, "step": 1369, "time_per_iteration": 2.769972085952759 }, { "auxiliary_loss_clip": 0.01137456, "auxiliary_loss_mlp": 0.00781814, "balance_loss_clip": 1.05438268, "balance_loss_mlp": 1.00027657, "epoch": 0.08236885615511799, "flos": 31759935753600.0, "grad_norm": 2.610758273724768, "language_loss": 0.74818152, "learning_rate": 3.971301156316582e-06, "loss": 0.76737428, "num_input_tokens_seen": 29129395, "step": 1370, "time_per_iteration": 4.497304201126099 }, { "auxiliary_loss_clip": 0.0115498, "auxiliary_loss_mlp": 0.01064278, "balance_loss_clip": 1.06403351, "balance_loss_mlp": 1.03987551, "epoch": 0.08242897940778596, "flos": 23185832601600.0, "grad_norm": 1.5246391685186451, "language_loss": 0.7398203, "learning_rate": 3.971235378388573e-06, "loss": 0.76201284, "num_input_tokens_seen": 29148650, "step": 1371, "time_per_iteration": 2.758089065551758 }, { "auxiliary_loss_clip": 0.01097162, "auxiliary_loss_mlp": 0.0106614, "balance_loss_clip": 1.05124569, "balance_loss_mlp": 1.04098701, "epoch": 0.08248910266045394, "flos": 34491480393600.0, "grad_norm": 1.9670948823939327, "language_loss": 0.70851803, "learning_rate": 3.971169525711122e-06, "loss": 0.73015106, "num_input_tokens_seen": 29170785, "step": 1372, "time_per_iteration": 4.069301605224609 }, { "auxiliary_loss_clip": 0.01162292, "auxiliary_loss_mlp": 0.01056859, "balance_loss_clip": 1.0571332, "balance_loss_mlp": 1.03261209, "epoch": 0.0825492259131219, "flos": 13436228960640.0, "grad_norm": 2.750431245604494, "language_loss": 0.88363653, "learning_rate": 3.9711035982867246e-06, "loss": 0.905828, "num_input_tokens_seen": 29185210, "step": 1373, "time_per_iteration": 3.9346964359283447 }, { "auxiliary_loss_clip": 0.01147291, "auxiliary_loss_mlp": 0.01062343, "balance_loss_clip": 1.05334187, "balance_loss_mlp": 1.03878665, "epoch": 0.08260934916578987, "flos": 25812446636160.0, "grad_norm": 2.128923272573014, "language_loss": 0.82465184, "learning_rate": 3.971037596117882e-06, "loss": 0.84674811, "num_input_tokens_seen": 29205210, "step": 1374, "time_per_iteration": 2.933377981185913 }, { "auxiliary_loss_clip": 0.01044322, "auxiliary_loss_mlp": 0.01017124, "balance_loss_clip": 1.03154135, "balance_loss_mlp": 1.0135479, "epoch": 0.08266947241845783, "flos": 63460009491840.0, "grad_norm": 0.8272339650193923, "language_loss": 0.60641956, "learning_rate": 3.970971519207095e-06, "loss": 0.62703401, "num_input_tokens_seen": 29265350, "step": 1375, "time_per_iteration": 3.3287038803100586 }, { "auxiliary_loss_clip": 0.01060461, "auxiliary_loss_mlp": 0.01013653, "balance_loss_clip": 1.02398169, "balance_loss_mlp": 1.01017237, "epoch": 0.08272959567112581, "flos": 69993704568960.0, "grad_norm": 0.9162492148708097, "language_loss": 0.62171799, "learning_rate": 3.970905367556871e-06, "loss": 0.64245915, "num_input_tokens_seen": 29321475, "step": 1376, "time_per_iteration": 3.218834161758423 }, { "auxiliary_loss_clip": 0.01159103, "auxiliary_loss_mlp": 0.0106347, "balance_loss_clip": 1.06229186, "balance_loss_mlp": 1.03942561, "epoch": 0.08278971892379378, "flos": 20413205781120.0, "grad_norm": 1.9191670647860084, "language_loss": 0.82577401, "learning_rate": 3.970839141169718e-06, "loss": 0.84799975, "num_input_tokens_seen": 29341405, "step": 1377, "time_per_iteration": 2.8763558864593506 }, { "auxiliary_loss_clip": 0.01176967, "auxiliary_loss_mlp": 0.01054072, "balance_loss_clip": 1.06486619, "balance_loss_mlp": 1.03011107, "epoch": 0.08284984217646174, "flos": 26250233598720.0, "grad_norm": 1.915539507671093, "language_loss": 0.84923226, "learning_rate": 3.970772840048147e-06, "loss": 0.87154263, "num_input_tokens_seen": 29361955, "step": 1378, "time_per_iteration": 2.8232595920562744 }, { "auxiliary_loss_clip": 0.01185329, "auxiliary_loss_mlp": 0.01058999, "balance_loss_clip": 1.06043923, "balance_loss_mlp": 1.0344305, "epoch": 0.08290996542912972, "flos": 27194683852800.0, "grad_norm": 6.4689921779024795, "language_loss": 0.87319231, "learning_rate": 3.970706464194672e-06, "loss": 0.8956356, "num_input_tokens_seen": 29382395, "step": 1379, "time_per_iteration": 2.756082534790039 }, { "auxiliary_loss_clip": 0.01158173, "auxiliary_loss_mlp": 0.01061479, "balance_loss_clip": 1.05779433, "balance_loss_mlp": 1.03829277, "epoch": 0.08297008868179769, "flos": 38618191146240.0, "grad_norm": 2.078993196749275, "language_loss": 0.78545237, "learning_rate": 3.970640013611812e-06, "loss": 0.8076489, "num_input_tokens_seen": 29404460, "step": 1380, "time_per_iteration": 2.9525601863861084 }, { "auxiliary_loss_clip": 0.01183492, "auxiliary_loss_mlp": 0.01059448, "balance_loss_clip": 1.06308961, "balance_loss_mlp": 1.0344255, "epoch": 0.08303021193446565, "flos": 19974736460160.0, "grad_norm": 2.6608111668609697, "language_loss": 0.86125714, "learning_rate": 3.970573488302083e-06, "loss": 0.88368654, "num_input_tokens_seen": 29422675, "step": 1381, "time_per_iteration": 2.735203742980957 }, { "auxiliary_loss_clip": 0.01197152, "auxiliary_loss_mlp": 0.00781814, "balance_loss_clip": 1.06611753, "balance_loss_mlp": 1.00034571, "epoch": 0.08309033518713363, "flos": 13662646341120.0, "grad_norm": 2.9433398182948203, "language_loss": 0.87471211, "learning_rate": 3.970506888268011e-06, "loss": 0.89450181, "num_input_tokens_seen": 29439840, "step": 1382, "time_per_iteration": 2.6392617225646973 }, { "auxiliary_loss_clip": 0.0115996, "auxiliary_loss_mlp": 0.01055463, "balance_loss_clip": 1.06138313, "balance_loss_mlp": 1.03337312, "epoch": 0.0831504584398016, "flos": 17968551068160.0, "grad_norm": 1.9901989904031434, "language_loss": 0.77085757, "learning_rate": 3.970440213512121e-06, "loss": 0.79301178, "num_input_tokens_seen": 29457360, "step": 1383, "time_per_iteration": 2.756565809249878 }, { "auxiliary_loss_clip": 0.01191549, "auxiliary_loss_mlp": 0.01058014, "balance_loss_clip": 1.06211782, "balance_loss_mlp": 1.03395748, "epoch": 0.08321058169246956, "flos": 22601386408320.0, "grad_norm": 1.818236548161018, "language_loss": 0.82858944, "learning_rate": 3.97037346403694e-06, "loss": 0.85108507, "num_input_tokens_seen": 29477040, "step": 1384, "time_per_iteration": 2.7848587036132812 }, { "auxiliary_loss_clip": 0.01148661, "auxiliary_loss_mlp": 0.01063605, "balance_loss_clip": 1.05671442, "balance_loss_mlp": 1.03610373, "epoch": 0.08327070494513754, "flos": 22850426378880.0, "grad_norm": 3.9982776391866346, "language_loss": 0.85219657, "learning_rate": 3.970306639845e-06, "loss": 0.8743192, "num_input_tokens_seen": 29492010, "step": 1385, "time_per_iteration": 2.803893566131592 }, { "auxiliary_loss_clip": 0.01157001, "auxiliary_loss_mlp": 0.01061891, "balance_loss_clip": 1.05823874, "balance_loss_mlp": 1.03750122, "epoch": 0.0833308281978055, "flos": 22782986593920.0, "grad_norm": 1.7071515381676081, "language_loss": 0.69195282, "learning_rate": 3.970239740938835e-06, "loss": 0.71414173, "num_input_tokens_seen": 29511850, "step": 1386, "time_per_iteration": 3.004786252975464 }, { "auxiliary_loss_clip": 0.01172803, "auxiliary_loss_mlp": 0.01058809, "balance_loss_clip": 1.05489016, "balance_loss_mlp": 1.03483546, "epoch": 0.08339095145047347, "flos": 20812604083200.0, "grad_norm": 1.672791522425571, "language_loss": 0.81894958, "learning_rate": 3.97017276732098e-06, "loss": 0.84126568, "num_input_tokens_seen": 29531415, "step": 1387, "time_per_iteration": 2.7678542137145996 }, { "auxiliary_loss_clip": 0.01179554, "auxiliary_loss_mlp": 0.01074251, "balance_loss_clip": 1.06179345, "balance_loss_mlp": 1.04817975, "epoch": 0.08345107470314143, "flos": 18515326872960.0, "grad_norm": 2.071322011459688, "language_loss": 0.77205479, "learning_rate": 3.970105718993978e-06, "loss": 0.7945928, "num_input_tokens_seen": 29549525, "step": 1388, "time_per_iteration": 2.8246304988861084 }, { "auxiliary_loss_clip": 0.01130856, "auxiliary_loss_mlp": 0.01062414, "balance_loss_clip": 1.05684018, "balance_loss_mlp": 1.03742766, "epoch": 0.08351119795580941, "flos": 18807567926400.0, "grad_norm": 2.0255270252506636, "language_loss": 0.79527366, "learning_rate": 3.970038595960369e-06, "loss": 0.81720638, "num_input_tokens_seen": 29568705, "step": 1389, "time_per_iteration": 2.8606414794921875 }, { "auxiliary_loss_clip": 0.01172785, "auxiliary_loss_mlp": 0.01064077, "balance_loss_clip": 1.05787444, "balance_loss_mlp": 1.03923428, "epoch": 0.08357132120847738, "flos": 18441817689600.0, "grad_norm": 2.546615132743645, "language_loss": 0.87427586, "learning_rate": 3.969971398222699e-06, "loss": 0.89664447, "num_input_tokens_seen": 29585855, "step": 1390, "time_per_iteration": 2.795931577682495 }, { "auxiliary_loss_clip": 0.01160426, "auxiliary_loss_mlp": 0.01067723, "balance_loss_clip": 1.05447149, "balance_loss_mlp": 1.04082966, "epoch": 0.08363144446114534, "flos": 25922333318400.0, "grad_norm": 1.8703157168219726, "language_loss": 0.86833143, "learning_rate": 3.969904125783517e-06, "loss": 0.89061296, "num_input_tokens_seen": 29607280, "step": 1391, "time_per_iteration": 2.811598062515259 }, { "auxiliary_loss_clip": 0.01156119, "auxiliary_loss_mlp": 0.01076482, "balance_loss_clip": 1.05575848, "balance_loss_mlp": 1.05180562, "epoch": 0.08369156771381332, "flos": 18041306065920.0, "grad_norm": 3.7979396758909263, "language_loss": 0.87688571, "learning_rate": 3.969836778645371e-06, "loss": 0.89921176, "num_input_tokens_seen": 29624130, "step": 1392, "time_per_iteration": 2.776819944381714 }, { "auxiliary_loss_clip": 0.01183316, "auxiliary_loss_mlp": 0.01058545, "balance_loss_clip": 1.05830503, "balance_loss_mlp": 1.03500128, "epoch": 0.08375169096648129, "flos": 22675111073280.0, "grad_norm": 8.95243370865895, "language_loss": 0.80574775, "learning_rate": 3.969769356810819e-06, "loss": 0.82816637, "num_input_tokens_seen": 29643210, "step": 1393, "time_per_iteration": 2.735761880874634 }, { "auxiliary_loss_clip": 0.01197686, "auxiliary_loss_mlp": 0.01058125, "balance_loss_clip": 1.06329441, "balance_loss_mlp": 1.03466487, "epoch": 0.08381181421914925, "flos": 26103215232000.0, "grad_norm": 1.7485261130451684, "language_loss": 0.85064757, "learning_rate": 3.969701860282415e-06, "loss": 0.87320572, "num_input_tokens_seen": 29663920, "step": 1394, "time_per_iteration": 2.950211524963379 }, { "auxiliary_loss_clip": 0.01145594, "auxiliary_loss_mlp": 0.01058123, "balance_loss_clip": 1.05994248, "balance_loss_mlp": 1.03432918, "epoch": 0.08387193747181723, "flos": 20629782835200.0, "grad_norm": 1.782466846937859, "language_loss": 0.82979721, "learning_rate": 3.969634289062719e-06, "loss": 0.85183442, "num_input_tokens_seen": 29683825, "step": 1395, "time_per_iteration": 2.883977174758911 }, { "auxiliary_loss_clip": 0.01187279, "auxiliary_loss_mlp": 0.00782865, "balance_loss_clip": 1.06065941, "balance_loss_mlp": 1.00028706, "epoch": 0.0839320607244852, "flos": 13443196199040.0, "grad_norm": 3.330409107955743, "language_loss": 0.82481396, "learning_rate": 3.969566643154293e-06, "loss": 0.84451544, "num_input_tokens_seen": 29698775, "step": 1396, "time_per_iteration": 2.6729378700256348 }, { "auxiliary_loss_clip": 0.0118605, "auxiliary_loss_mlp": 0.01060468, "balance_loss_clip": 1.06378388, "balance_loss_mlp": 1.03475475, "epoch": 0.08399218397715316, "flos": 23477247642240.0, "grad_norm": 1.780410555630689, "language_loss": 0.76843297, "learning_rate": 3.969498922559703e-06, "loss": 0.79089814, "num_input_tokens_seen": 29719430, "step": 1397, "time_per_iteration": 2.64888334274292 }, { "auxiliary_loss_clip": 0.01153742, "auxiliary_loss_mlp": 0.01050759, "balance_loss_clip": 1.05790138, "balance_loss_mlp": 1.02621412, "epoch": 0.08405230722982113, "flos": 25920717206400.0, "grad_norm": 2.1323769932413184, "language_loss": 0.77941638, "learning_rate": 3.969431127281516e-06, "loss": 0.8014614, "num_input_tokens_seen": 29739685, "step": 1398, "time_per_iteration": 2.8302125930786133 }, { "auxiliary_loss_clip": 0.01191086, "auxiliary_loss_mlp": 0.01052374, "balance_loss_clip": 1.05962944, "balance_loss_mlp": 1.02943766, "epoch": 0.0841124304824891, "flos": 17967437746560.0, "grad_norm": 2.150764713624159, "language_loss": 0.94635069, "learning_rate": 3.969363257322304e-06, "loss": 0.96878529, "num_input_tokens_seen": 29756165, "step": 1399, "time_per_iteration": 2.650517702102661 }, { "auxiliary_loss_clip": 0.01172403, "auxiliary_loss_mlp": 0.0106738, "balance_loss_clip": 1.0562712, "balance_loss_mlp": 1.04168999, "epoch": 0.08417255373515707, "flos": 25629661301760.0, "grad_norm": 3.6141849657848137, "language_loss": 0.81904209, "learning_rate": 3.96929531268464e-06, "loss": 0.8414399, "num_input_tokens_seen": 29776425, "step": 1400, "time_per_iteration": 2.777369260787964 }, { "auxiliary_loss_clip": 0.01170173, "auxiliary_loss_mlp": 0.01064292, "balance_loss_clip": 1.05968165, "balance_loss_mlp": 1.03957999, "epoch": 0.08423267698782504, "flos": 26249730808320.0, "grad_norm": 8.998651919840762, "language_loss": 0.8642807, "learning_rate": 3.969227293371099e-06, "loss": 0.88662529, "num_input_tokens_seen": 29796440, "step": 1401, "time_per_iteration": 2.91375732421875 }, { "auxiliary_loss_clip": 0.01196, "auxiliary_loss_mlp": 0.01066109, "balance_loss_clip": 1.05935979, "balance_loss_mlp": 1.04053831, "epoch": 0.08429280024049302, "flos": 20119707751680.0, "grad_norm": 2.9792515680869114, "language_loss": 0.87500131, "learning_rate": 3.969159199384263e-06, "loss": 0.89762247, "num_input_tokens_seen": 29814755, "step": 1402, "time_per_iteration": 2.7827296257019043 }, { "auxiliary_loss_clip": 0.01144907, "auxiliary_loss_mlp": 0.00781428, "balance_loss_clip": 1.05105817, "balance_loss_mlp": 1.00033188, "epoch": 0.08435292349316098, "flos": 42924526836480.0, "grad_norm": 2.1517994230241566, "language_loss": 0.8905524, "learning_rate": 3.9690910307267125e-06, "loss": 0.90981579, "num_input_tokens_seen": 29834785, "step": 1403, "time_per_iteration": 2.931666374206543 }, { "auxiliary_loss_clip": 0.01165276, "auxiliary_loss_mlp": 0.01061696, "balance_loss_clip": 1.05570936, "balance_loss_mlp": 1.03715038, "epoch": 0.08441304674582895, "flos": 22857285876480.0, "grad_norm": 1.790271378285476, "language_loss": 0.80321431, "learning_rate": 3.969022787401033e-06, "loss": 0.82548404, "num_input_tokens_seen": 29854695, "step": 1404, "time_per_iteration": 2.7397725582122803 }, { "auxiliary_loss_clip": 0.01181709, "auxiliary_loss_mlp": 0.01071408, "balance_loss_clip": 1.06211567, "balance_loss_mlp": 1.04649353, "epoch": 0.08447316999849692, "flos": 18697501676160.0, "grad_norm": 2.0849305916509193, "language_loss": 0.83557045, "learning_rate": 3.968954469409811e-06, "loss": 0.85810155, "num_input_tokens_seen": 29872180, "step": 1405, "time_per_iteration": 2.8052847385406494 }, { "auxiliary_loss_clip": 0.0118246, "auxiliary_loss_mlp": 0.01058347, "balance_loss_clip": 1.05636072, "balance_loss_mlp": 1.03588748, "epoch": 0.08453329325116489, "flos": 25483971738240.0, "grad_norm": 1.5225846020503528, "language_loss": 0.7991904, "learning_rate": 3.968886076755639e-06, "loss": 0.82159847, "num_input_tokens_seen": 29893205, "step": 1406, "time_per_iteration": 4.301243305206299 }, { "auxiliary_loss_clip": 0.0117117, "auxiliary_loss_mlp": 0.01068275, "balance_loss_clip": 1.05790758, "balance_loss_mlp": 1.04406369, "epoch": 0.08459341650383286, "flos": 20920048640640.0, "grad_norm": 1.717770739318623, "language_loss": 0.79441547, "learning_rate": 3.96881760944111e-06, "loss": 0.81680995, "num_input_tokens_seen": 29911970, "step": 1407, "time_per_iteration": 2.6535613536834717 }, { "auxiliary_loss_clip": 0.01186501, "auxiliary_loss_mlp": 0.01057881, "balance_loss_clip": 1.05982685, "balance_loss_mlp": 1.03409886, "epoch": 0.08465353975650082, "flos": 13043079624960.0, "grad_norm": 2.191354041218588, "language_loss": 0.91799384, "learning_rate": 3.968749067468819e-06, "loss": 0.94043779, "num_input_tokens_seen": 29929925, "step": 1408, "time_per_iteration": 5.774486064910889 }, { "auxiliary_loss_clip": 0.01058217, "auxiliary_loss_mlp": 0.01015213, "balance_loss_clip": 1.0231359, "balance_loss_mlp": 1.01139832, "epoch": 0.0847136630091688, "flos": 60877422552960.0, "grad_norm": 0.9559717259642487, "language_loss": 0.61891782, "learning_rate": 3.968680450841368e-06, "loss": 0.63965201, "num_input_tokens_seen": 29985950, "step": 1409, "time_per_iteration": 4.9455225467681885 }, { "auxiliary_loss_clip": 0.01188186, "auxiliary_loss_mlp": 0.01061718, "balance_loss_clip": 1.05840743, "balance_loss_mlp": 1.03878236, "epoch": 0.08477378626183676, "flos": 22046530043520.0, "grad_norm": 1.6980375913788566, "language_loss": 0.86357373, "learning_rate": 3.968611759561355e-06, "loss": 0.88607281, "num_input_tokens_seen": 30004330, "step": 1410, "time_per_iteration": 2.640355110168457 }, { "auxiliary_loss_clip": 0.01181512, "auxiliary_loss_mlp": 0.01053874, "balance_loss_clip": 1.0583061, "balance_loss_mlp": 1.02870846, "epoch": 0.08483390951450473, "flos": 16690059308160.0, "grad_norm": 2.248971712939306, "language_loss": 0.74384397, "learning_rate": 3.968542993631388e-06, "loss": 0.7661978, "num_input_tokens_seen": 30022555, "step": 1411, "time_per_iteration": 2.6200830936431885 }, { "auxiliary_loss_clip": 0.01077929, "auxiliary_loss_mlp": 0.01003535, "balance_loss_clip": 1.02317524, "balance_loss_mlp": 0.99991113, "epoch": 0.08489403276717271, "flos": 51584640082560.0, "grad_norm": 0.9014663966204861, "language_loss": 0.56748837, "learning_rate": 3.968474153054073e-06, "loss": 0.58830309, "num_input_tokens_seen": 30077220, "step": 1412, "time_per_iteration": 3.0746512413024902 }, { "auxiliary_loss_clip": 0.01156137, "auxiliary_loss_mlp": 0.01067795, "balance_loss_clip": 1.05325568, "balance_loss_mlp": 1.04265356, "epoch": 0.08495415601984067, "flos": 17092330698240.0, "grad_norm": 2.2757293876932945, "language_loss": 0.88754624, "learning_rate": 3.96840523783202e-06, "loss": 0.90978551, "num_input_tokens_seen": 30094600, "step": 1413, "time_per_iteration": 2.7309420108795166 }, { "auxiliary_loss_clip": 0.01164895, "auxiliary_loss_mlp": 0.01057479, "balance_loss_clip": 1.05780244, "balance_loss_mlp": 1.03295755, "epoch": 0.08501427927250864, "flos": 23148413608320.0, "grad_norm": 1.9781781646219805, "language_loss": 0.87963474, "learning_rate": 3.968336247967844e-06, "loss": 0.90185857, "num_input_tokens_seen": 30114475, "step": 1414, "time_per_iteration": 2.692030668258667 }, { "auxiliary_loss_clip": 0.01168145, "auxiliary_loss_mlp": 0.01063751, "balance_loss_clip": 1.05704033, "balance_loss_mlp": 1.04170966, "epoch": 0.08507440252517662, "flos": 19063467394560.0, "grad_norm": 1.9706021333256292, "language_loss": 0.77636635, "learning_rate": 3.96826718346416e-06, "loss": 0.79868531, "num_input_tokens_seen": 30133350, "step": 1415, "time_per_iteration": 2.8435540199279785 }, { "auxiliary_loss_clip": 0.01182108, "auxiliary_loss_mlp": 0.01059478, "balance_loss_clip": 1.0588963, "balance_loss_mlp": 1.03701878, "epoch": 0.08513452577784458, "flos": 60182296600320.0, "grad_norm": 1.7170282174092708, "language_loss": 0.70545506, "learning_rate": 3.968198044323587e-06, "loss": 0.72787094, "num_input_tokens_seen": 30159005, "step": 1416, "time_per_iteration": 3.021360158920288 }, { "auxiliary_loss_clip": 0.01174166, "auxiliary_loss_mlp": 0.01066487, "balance_loss_clip": 1.05930233, "balance_loss_mlp": 1.04131043, "epoch": 0.08519464903051255, "flos": 27308485117440.0, "grad_norm": 2.8159853289102053, "language_loss": 0.74938154, "learning_rate": 3.968128830548748e-06, "loss": 0.771788, "num_input_tokens_seen": 30179450, "step": 1417, "time_per_iteration": 2.738301992416382 }, { "auxiliary_loss_clip": 0.01171292, "auxiliary_loss_mlp": 0.01057092, "balance_loss_clip": 1.05715823, "balance_loss_mlp": 1.03313112, "epoch": 0.08525477228318051, "flos": 20266438809600.0, "grad_norm": 2.4132423968154635, "language_loss": 0.8258723, "learning_rate": 3.968059542142265e-06, "loss": 0.84815615, "num_input_tokens_seen": 30197235, "step": 1418, "time_per_iteration": 2.671574831008911 }, { "auxiliary_loss_clip": 0.0104499, "auxiliary_loss_mlp": 0.01004818, "balance_loss_clip": 1.02242994, "balance_loss_mlp": 1.0004549, "epoch": 0.08531489553584849, "flos": 67615017183360.0, "grad_norm": 0.8667411864001444, "language_loss": 0.56638753, "learning_rate": 3.9679901791067685e-06, "loss": 0.58688557, "num_input_tokens_seen": 30257410, "step": 1419, "time_per_iteration": 3.199730396270752 }, { "auxiliary_loss_clip": 0.01192231, "auxiliary_loss_mlp": 0.01067737, "balance_loss_clip": 1.05757999, "balance_loss_mlp": 1.04369283, "epoch": 0.08537501878851646, "flos": 27526965592320.0, "grad_norm": 2.2357492693560466, "language_loss": 0.70111859, "learning_rate": 3.967920741444886e-06, "loss": 0.72371829, "num_input_tokens_seen": 30277865, "step": 1420, "time_per_iteration": 2.7176027297973633 }, { "auxiliary_loss_clip": 0.01155207, "auxiliary_loss_mlp": 0.01050755, "balance_loss_clip": 1.05377483, "balance_loss_mlp": 1.02692556, "epoch": 0.08543514204118442, "flos": 22784243569920.0, "grad_norm": 1.5975069204011494, "language_loss": 0.88011539, "learning_rate": 3.967851229159252e-06, "loss": 0.90217495, "num_input_tokens_seen": 30298545, "step": 1421, "time_per_iteration": 2.7552106380462646 }, { "auxiliary_loss_clip": 0.01077473, "auxiliary_loss_mlp": 0.01013517, "balance_loss_clip": 1.02364218, "balance_loss_mlp": 1.01020324, "epoch": 0.0854952652938524, "flos": 60990721027200.0, "grad_norm": 0.9142209544576306, "language_loss": 0.63506877, "learning_rate": 3.967781642252502e-06, "loss": 0.65597868, "num_input_tokens_seen": 30361725, "step": 1422, "time_per_iteration": 3.134183168411255 }, { "auxiliary_loss_clip": 0.01152948, "auxiliary_loss_mlp": 0.01063847, "balance_loss_clip": 1.05932307, "balance_loss_mlp": 1.0406723, "epoch": 0.08555538854652037, "flos": 28038046256640.0, "grad_norm": 1.8757015124159093, "language_loss": 0.82691669, "learning_rate": 3.967711980727276e-06, "loss": 0.84908462, "num_input_tokens_seen": 30382180, "step": 1423, "time_per_iteration": 2.789393424987793 }, { "auxiliary_loss_clip": 0.01153439, "auxiliary_loss_mlp": 0.01064169, "balance_loss_clip": 1.0526228, "balance_loss_mlp": 1.04089928, "epoch": 0.08561551179918833, "flos": 23509279595520.0, "grad_norm": 1.6593534429066656, "language_loss": 0.75424892, "learning_rate": 3.967642244586213e-06, "loss": 0.776425, "num_input_tokens_seen": 30402980, "step": 1424, "time_per_iteration": 2.7805826663970947 }, { "auxiliary_loss_clip": 0.01139579, "auxiliary_loss_mlp": 0.01060342, "balance_loss_clip": 1.05769765, "balance_loss_mlp": 1.03751373, "epoch": 0.08567563505185631, "flos": 17926930183680.0, "grad_norm": 1.7999307606718091, "language_loss": 0.75948423, "learning_rate": 3.96757243383196e-06, "loss": 0.78148341, "num_input_tokens_seen": 30420800, "step": 1425, "time_per_iteration": 2.677889823913574 }, { "auxiliary_loss_clip": 0.0118966, "auxiliary_loss_mlp": 0.01055231, "balance_loss_clip": 1.05982256, "balance_loss_mlp": 1.03230715, "epoch": 0.08573575830452428, "flos": 19719519350400.0, "grad_norm": 2.1792756220437743, "language_loss": 0.93362999, "learning_rate": 3.9675025484671624e-06, "loss": 0.95607889, "num_input_tokens_seen": 30439620, "step": 1426, "time_per_iteration": 2.6270906925201416 }, { "auxiliary_loss_clip": 0.01145994, "auxiliary_loss_mlp": 0.01066219, "balance_loss_clip": 1.05707717, "balance_loss_mlp": 1.0406251, "epoch": 0.08579588155719224, "flos": 17931563038080.0, "grad_norm": 2.3679064075186553, "language_loss": 0.75424731, "learning_rate": 3.967432588494471e-06, "loss": 0.77636945, "num_input_tokens_seen": 30457300, "step": 1427, "time_per_iteration": 2.84614634513855 }, { "auxiliary_loss_clip": 0.01190697, "auxiliary_loss_mlp": 0.01052992, "balance_loss_clip": 1.06006169, "balance_loss_mlp": 1.0305804, "epoch": 0.08585600480986022, "flos": 16033324993920.0, "grad_norm": 3.503048788198607, "language_loss": 0.82108849, "learning_rate": 3.96736255391654e-06, "loss": 0.84352541, "num_input_tokens_seen": 30471580, "step": 1428, "time_per_iteration": 2.5882396697998047 }, { "auxiliary_loss_clip": 0.01173688, "auxiliary_loss_mlp": 0.0106298, "balance_loss_clip": 1.05633736, "balance_loss_mlp": 1.03832793, "epoch": 0.08591612806252819, "flos": 28657433404800.0, "grad_norm": 2.088481658755078, "language_loss": 0.79929984, "learning_rate": 3.967292444736023e-06, "loss": 0.82166648, "num_input_tokens_seen": 30492720, "step": 1429, "time_per_iteration": 2.720500946044922 }, { "auxiliary_loss_clip": 0.01169119, "auxiliary_loss_mlp": 0.010606, "balance_loss_clip": 1.05971265, "balance_loss_mlp": 1.0379504, "epoch": 0.08597625131519615, "flos": 20959119659520.0, "grad_norm": 1.9029222975672677, "language_loss": 0.87716508, "learning_rate": 3.967222260955578e-06, "loss": 0.89946228, "num_input_tokens_seen": 30509535, "step": 1430, "time_per_iteration": 2.6914596557617188 }, { "auxiliary_loss_clip": 0.01144304, "auxiliary_loss_mlp": 0.01074633, "balance_loss_clip": 1.05802035, "balance_loss_mlp": 1.05125606, "epoch": 0.08603637456786412, "flos": 23256360956160.0, "grad_norm": 1.6366623508781384, "language_loss": 0.81859726, "learning_rate": 3.96715200257787e-06, "loss": 0.84078664, "num_input_tokens_seen": 30529490, "step": 1431, "time_per_iteration": 2.834402322769165 }, { "auxiliary_loss_clip": 0.01148362, "auxiliary_loss_mlp": 0.01054323, "balance_loss_clip": 1.05620182, "balance_loss_mlp": 1.03132737, "epoch": 0.0860964978205321, "flos": 28694170039680.0, "grad_norm": 1.5497375505717568, "language_loss": 0.78109461, "learning_rate": 3.967081669605559e-06, "loss": 0.80312145, "num_input_tokens_seen": 30550205, "step": 1432, "time_per_iteration": 2.767860174179077 }, { "auxiliary_loss_clip": 0.01167351, "auxiliary_loss_mlp": 0.0106333, "balance_loss_clip": 1.0540905, "balance_loss_mlp": 1.03914225, "epoch": 0.08615662107320006, "flos": 19318397195520.0, "grad_norm": 1.9631692713893694, "language_loss": 0.73365706, "learning_rate": 3.967011262041315e-06, "loss": 0.75596392, "num_input_tokens_seen": 30568830, "step": 1433, "time_per_iteration": 2.6930699348449707 }, { "auxiliary_loss_clip": 0.01150098, "auxiliary_loss_mlp": 0.00781967, "balance_loss_clip": 1.05335927, "balance_loss_mlp": 1.00044179, "epoch": 0.08621674432586802, "flos": 15851688894720.0, "grad_norm": 2.468588778716135, "language_loss": 0.85340321, "learning_rate": 3.9669407798878065e-06, "loss": 0.87272388, "num_input_tokens_seen": 30585730, "step": 1434, "time_per_iteration": 2.735690116882324 }, { "auxiliary_loss_clip": 0.01170363, "auxiliary_loss_mlp": 0.01057659, "balance_loss_clip": 1.05604434, "balance_loss_mlp": 1.0344249, "epoch": 0.086276867578536, "flos": 14100648785280.0, "grad_norm": 2.160640509122794, "language_loss": 0.7870298, "learning_rate": 3.966870223147707e-06, "loss": 0.80931008, "num_input_tokens_seen": 30603180, "step": 1435, "time_per_iteration": 2.776567220687866 }, { "auxiliary_loss_clip": 0.01047768, "auxiliary_loss_mlp": 0.01015597, "balance_loss_clip": 1.023893, "balance_loss_mlp": 1.01206815, "epoch": 0.08633699083120397, "flos": 70184857772160.0, "grad_norm": 0.8900716332014227, "language_loss": 0.57975936, "learning_rate": 3.96679959182369e-06, "loss": 0.60039294, "num_input_tokens_seen": 30668895, "step": 1436, "time_per_iteration": 3.344207763671875 }, { "auxiliary_loss_clip": 0.0117372, "auxiliary_loss_mlp": 0.01056829, "balance_loss_clip": 1.05617976, "balance_loss_mlp": 1.03153312, "epoch": 0.08639711408387193, "flos": 30298874140800.0, "grad_norm": 2.240343996649645, "language_loss": 0.69169062, "learning_rate": 3.966728885918437e-06, "loss": 0.71399617, "num_input_tokens_seen": 30688955, "step": 1437, "time_per_iteration": 2.7171547412872314 }, { "auxiliary_loss_clip": 0.01121044, "auxiliary_loss_mlp": 0.01055264, "balance_loss_clip": 1.05334914, "balance_loss_mlp": 1.03223276, "epoch": 0.08645723733653991, "flos": 20297680663680.0, "grad_norm": 2.1340571114707245, "language_loss": 0.72624576, "learning_rate": 3.966658105434627e-06, "loss": 0.74800885, "num_input_tokens_seen": 30706095, "step": 1438, "time_per_iteration": 2.7815651893615723 }, { "auxiliary_loss_clip": 0.01179626, "auxiliary_loss_mlp": 0.01052578, "balance_loss_clip": 1.06052637, "balance_loss_mlp": 1.02872419, "epoch": 0.08651736058920788, "flos": 32890583134080.0, "grad_norm": 1.5339762166114281, "language_loss": 0.64377135, "learning_rate": 3.966587250374945e-06, "loss": 0.66609335, "num_input_tokens_seen": 30729025, "step": 1439, "time_per_iteration": 2.8935797214508057 }, { "auxiliary_loss_clip": 0.01153286, "auxiliary_loss_mlp": 0.01056452, "balance_loss_clip": 1.05530453, "balance_loss_mlp": 1.03213322, "epoch": 0.08657748384187584, "flos": 22637368857600.0, "grad_norm": 5.193932354158579, "language_loss": 0.87521696, "learning_rate": 3.966516320742077e-06, "loss": 0.89731431, "num_input_tokens_seen": 30746155, "step": 1440, "time_per_iteration": 2.731531858444214 }, { "auxiliary_loss_clip": 0.01155923, "auxiliary_loss_mlp": 0.00782787, "balance_loss_clip": 1.05752945, "balance_loss_mlp": 1.00043201, "epoch": 0.08663760709454381, "flos": 23658380951040.0, "grad_norm": 2.023462963415533, "language_loss": 0.83434939, "learning_rate": 3.9664453165387124e-06, "loss": 0.85373652, "num_input_tokens_seen": 30761410, "step": 1441, "time_per_iteration": 2.7126500606536865 }, { "auxiliary_loss_clip": 0.01074667, "auxiliary_loss_mlp": 0.01004602, "balance_loss_clip": 1.0222367, "balance_loss_mlp": 1.00100195, "epoch": 0.08669773034721179, "flos": 62686564911360.0, "grad_norm": 0.8541685426878655, "language_loss": 0.60479522, "learning_rate": 3.966374237767545e-06, "loss": 0.62558794, "num_input_tokens_seen": 30823010, "step": 1442, "time_per_iteration": 3.25555157661438 }, { "auxiliary_loss_clip": 0.0116729, "auxiliary_loss_mlp": 0.01054262, "balance_loss_clip": 1.05768681, "balance_loss_mlp": 1.03075421, "epoch": 0.08675785359987975, "flos": 20667489137280.0, "grad_norm": 2.8449103562639073, "language_loss": 0.79304373, "learning_rate": 3.96630308443127e-06, "loss": 0.81525922, "num_input_tokens_seen": 30841980, "step": 1443, "time_per_iteration": 2.7314631938934326 }, { "auxiliary_loss_clip": 0.01180858, "auxiliary_loss_mlp": 0.01051075, "balance_loss_clip": 1.05780149, "balance_loss_mlp": 1.02755547, "epoch": 0.08681797685254772, "flos": 26941118768640.0, "grad_norm": 1.6739262813835734, "language_loss": 0.82399666, "learning_rate": 3.966231856532584e-06, "loss": 0.84631598, "num_input_tokens_seen": 30863280, "step": 1444, "time_per_iteration": 2.7341418266296387 }, { "auxiliary_loss_clip": 0.01196759, "auxiliary_loss_mlp": 0.01051473, "balance_loss_clip": 1.06044626, "balance_loss_mlp": 1.02810788, "epoch": 0.0868781001052157, "flos": 17712831168000.0, "grad_norm": 2.3015461969915747, "language_loss": 0.87354827, "learning_rate": 3.966160554074189e-06, "loss": 0.8960306, "num_input_tokens_seen": 30881710, "step": 1445, "time_per_iteration": 4.25179386138916 }, { "auxiliary_loss_clip": 0.01180784, "auxiliary_loss_mlp": 0.01055896, "balance_loss_clip": 1.06094933, "balance_loss_mlp": 1.03446186, "epoch": 0.08693822335788366, "flos": 19896522595200.0, "grad_norm": 1.8066650797875201, "language_loss": 0.81863767, "learning_rate": 3.96608917705879e-06, "loss": 0.84100449, "num_input_tokens_seen": 30900225, "step": 1446, "time_per_iteration": 4.197181940078735 }, { "auxiliary_loss_clip": 0.01056056, "auxiliary_loss_mlp": 0.01004371, "balance_loss_clip": 1.01782191, "balance_loss_mlp": 1.00031781, "epoch": 0.08699834661055163, "flos": 67023747406080.0, "grad_norm": 0.7255245569613363, "language_loss": 0.54762936, "learning_rate": 3.966017725489091e-06, "loss": 0.56823361, "num_input_tokens_seen": 30959580, "step": 1447, "time_per_iteration": 3.2158126831054688 }, { "auxiliary_loss_clip": 0.0114861, "auxiliary_loss_mlp": 0.01056824, "balance_loss_clip": 1.05373001, "balance_loss_mlp": 1.03518772, "epoch": 0.0870584698632196, "flos": 13480507451520.0, "grad_norm": 2.1586118179593696, "language_loss": 0.84592307, "learning_rate": 3.965946199367804e-06, "loss": 0.86797738, "num_input_tokens_seen": 30976775, "step": 1448, "time_per_iteration": 4.262767314910889 }, { "auxiliary_loss_clip": 0.01194173, "auxiliary_loss_mlp": 0.01050219, "balance_loss_clip": 1.05891991, "balance_loss_mlp": 1.02768826, "epoch": 0.08711859311588757, "flos": 16107013745280.0, "grad_norm": 3.4326906921347096, "language_loss": 0.80644608, "learning_rate": 3.965874598697638e-06, "loss": 0.82888997, "num_input_tokens_seen": 30990495, "step": 1449, "time_per_iteration": 4.553676128387451 }, { "auxiliary_loss_clip": 0.01138548, "auxiliary_loss_mlp": 0.01052142, "balance_loss_clip": 1.05437374, "balance_loss_mlp": 1.02946854, "epoch": 0.08717871636855554, "flos": 38472357928320.0, "grad_norm": 1.5251600336566102, "language_loss": 0.70971417, "learning_rate": 3.965802923481313e-06, "loss": 0.73162109, "num_input_tokens_seen": 31014080, "step": 1450, "time_per_iteration": 2.9082705974578857 }, { "auxiliary_loss_clip": 0.01124466, "auxiliary_loss_mlp": 0.01054883, "balance_loss_clip": 1.05164719, "balance_loss_mlp": 1.03207827, "epoch": 0.0872388396212235, "flos": 17600574188160.0, "grad_norm": 1.9392114767205617, "language_loss": 0.83684897, "learning_rate": 3.965731173721542e-06, "loss": 0.85864246, "num_input_tokens_seen": 31031210, "step": 1451, "time_per_iteration": 2.809880495071411 }, { "auxiliary_loss_clip": 0.01134251, "auxiliary_loss_mlp": 0.00780873, "balance_loss_clip": 1.05147851, "balance_loss_mlp": 1.00039482, "epoch": 0.08729896287389148, "flos": 25259385951360.0, "grad_norm": 2.5160845512367773, "language_loss": 0.74654591, "learning_rate": 3.965659349421049e-06, "loss": 0.76569718, "num_input_tokens_seen": 31049710, "step": 1452, "time_per_iteration": 2.88580060005188 }, { "auxiliary_loss_clip": 0.01157134, "auxiliary_loss_mlp": 0.01063328, "balance_loss_clip": 1.05607891, "balance_loss_mlp": 1.0388428, "epoch": 0.08735908612655945, "flos": 15632454234240.0, "grad_norm": 4.56941406999875, "language_loss": 0.80543101, "learning_rate": 3.965587450582556e-06, "loss": 0.82763565, "num_input_tokens_seen": 31066160, "step": 1453, "time_per_iteration": 2.733632802963257 }, { "auxiliary_loss_clip": 0.01169707, "auxiliary_loss_mlp": 0.01059533, "balance_loss_clip": 1.05905569, "balance_loss_mlp": 1.03625154, "epoch": 0.08741920937922741, "flos": 20339660684160.0, "grad_norm": 2.0102093196988102, "language_loss": 0.71041977, "learning_rate": 3.96551547720879e-06, "loss": 0.73271215, "num_input_tokens_seen": 31085270, "step": 1454, "time_per_iteration": 2.7568745613098145 }, { "auxiliary_loss_clip": 0.0106426, "auxiliary_loss_mlp": 0.01008112, "balance_loss_clip": 1.0215131, "balance_loss_mlp": 1.00463128, "epoch": 0.08747933263189539, "flos": 62819795433600.0, "grad_norm": 0.7713706503543015, "language_loss": 0.5859946, "learning_rate": 3.96544342930248e-06, "loss": 0.6067183, "num_input_tokens_seen": 31148445, "step": 1455, "time_per_iteration": 3.2372186183929443 }, { "auxiliary_loss_clip": 0.01189404, "auxiliary_loss_mlp": 0.01060742, "balance_loss_clip": 1.05742884, "balance_loss_mlp": 1.03688788, "epoch": 0.08753945588456336, "flos": 33035877648000.0, "grad_norm": 1.6485208275358016, "language_loss": 0.77564865, "learning_rate": 3.965371306866359e-06, "loss": 0.79815018, "num_input_tokens_seen": 31168770, "step": 1456, "time_per_iteration": 2.790663003921509 }, { "auxiliary_loss_clip": 0.01127959, "auxiliary_loss_mlp": 0.01054526, "balance_loss_clip": 1.04962158, "balance_loss_mlp": 1.03071976, "epoch": 0.08759957913723132, "flos": 35547182046720.0, "grad_norm": 1.83889407784057, "language_loss": 0.72420907, "learning_rate": 3.96529910990316e-06, "loss": 0.74603397, "num_input_tokens_seen": 31189270, "step": 1457, "time_per_iteration": 2.9099740982055664 }, { "auxiliary_loss_clip": 0.01176549, "auxiliary_loss_mlp": 0.0104866, "balance_loss_clip": 1.05627227, "balance_loss_mlp": 1.02633214, "epoch": 0.0876597023898993, "flos": 23911120022400.0, "grad_norm": 1.5250401870177361, "language_loss": 0.86412215, "learning_rate": 3.965226838415622e-06, "loss": 0.88637424, "num_input_tokens_seen": 31210385, "step": 1458, "time_per_iteration": 2.7517166137695312 }, { "auxiliary_loss_clip": 0.01169535, "auxiliary_loss_mlp": 0.01061413, "balance_loss_clip": 1.05884266, "balance_loss_mlp": 1.03825045, "epoch": 0.08771982564256726, "flos": 18114025150080.0, "grad_norm": 1.7412813512419094, "language_loss": 0.80268395, "learning_rate": 3.965154492406486e-06, "loss": 0.82499349, "num_input_tokens_seen": 31229745, "step": 1459, "time_per_iteration": 2.71455717086792 }, { "auxiliary_loss_clip": 0.01130491, "auxiliary_loss_mlp": 0.01054334, "balance_loss_clip": 1.05256546, "balance_loss_mlp": 1.03018188, "epoch": 0.08777994889523523, "flos": 17712005155200.0, "grad_norm": 2.1450339680450714, "language_loss": 0.84538847, "learning_rate": 3.9650820718784945e-06, "loss": 0.86723673, "num_input_tokens_seen": 31248280, "step": 1460, "time_per_iteration": 2.8737733364105225 }, { "auxiliary_loss_clip": 0.01177787, "auxiliary_loss_mlp": 0.01057974, "balance_loss_clip": 1.0572983, "balance_loss_mlp": 1.03640938, "epoch": 0.0878400721479032, "flos": 12819930382080.0, "grad_norm": 4.917361835698274, "language_loss": 0.79993135, "learning_rate": 3.965009576834394e-06, "loss": 0.82228899, "num_input_tokens_seen": 31262190, "step": 1461, "time_per_iteration": 2.8436062335968018 }, { "auxiliary_loss_clip": 0.01169165, "auxiliary_loss_mlp": 0.01058947, "balance_loss_clip": 1.05800629, "balance_loss_mlp": 1.03704822, "epoch": 0.08790019540057117, "flos": 26392690938240.0, "grad_norm": 1.566202508611165, "language_loss": 0.76571167, "learning_rate": 3.964937007276932e-06, "loss": 0.78799284, "num_input_tokens_seen": 31283690, "step": 1462, "time_per_iteration": 2.7895474433898926 }, { "auxiliary_loss_clip": 0.0117563, "auxiliary_loss_mlp": 0.01060064, "balance_loss_clip": 1.05839491, "balance_loss_mlp": 1.03580475, "epoch": 0.08796031865323914, "flos": 19134031662720.0, "grad_norm": 2.89717114041641, "language_loss": 0.74710488, "learning_rate": 3.9648643632088634e-06, "loss": 0.76946187, "num_input_tokens_seen": 31302505, "step": 1463, "time_per_iteration": 2.760404348373413 }, { "auxiliary_loss_clip": 0.01191543, "auxiliary_loss_mlp": 0.01061609, "balance_loss_clip": 1.06145048, "balance_loss_mlp": 1.03680158, "epoch": 0.0880204419059071, "flos": 26064287867520.0, "grad_norm": 2.431514195311041, "language_loss": 0.83797103, "learning_rate": 3.964791644632941e-06, "loss": 0.8605026, "num_input_tokens_seen": 31323070, "step": 1464, "time_per_iteration": 2.7417759895324707 }, { "auxiliary_loss_clip": 0.011733, "auxiliary_loss_mlp": 0.01063475, "balance_loss_clip": 1.05683231, "balance_loss_mlp": 1.04093289, "epoch": 0.08808056515857508, "flos": 22377842115840.0, "grad_norm": 2.1775753375634963, "language_loss": 0.78104752, "learning_rate": 3.964718851551923e-06, "loss": 0.8034153, "num_input_tokens_seen": 31341880, "step": 1465, "time_per_iteration": 2.6852309703826904 }, { "auxiliary_loss_clip": 0.01199489, "auxiliary_loss_mlp": 0.01059873, "balance_loss_clip": 1.0619812, "balance_loss_mlp": 1.03791499, "epoch": 0.08814068841124305, "flos": 23185293897600.0, "grad_norm": 2.412657222564686, "language_loss": 0.85187089, "learning_rate": 3.9646459839685675e-06, "loss": 0.87446451, "num_input_tokens_seen": 31361995, "step": 1466, "time_per_iteration": 2.706264019012451 }, { "auxiliary_loss_clip": 0.01120627, "auxiliary_loss_mlp": 0.00782645, "balance_loss_clip": 1.04989958, "balance_loss_mlp": 1.00037241, "epoch": 0.08820081166391101, "flos": 25155281358720.0, "grad_norm": 1.9900601596102498, "language_loss": 0.84168816, "learning_rate": 3.964573041885641e-06, "loss": 0.86072087, "num_input_tokens_seen": 31381515, "step": 1467, "time_per_iteration": 2.8636934757232666 }, { "auxiliary_loss_clip": 0.01178935, "auxiliary_loss_mlp": 0.01055379, "balance_loss_clip": 1.05910301, "balance_loss_mlp": 1.03219247, "epoch": 0.08826093491657899, "flos": 22231685675520.0, "grad_norm": 1.660218686828999, "language_loss": 0.75506544, "learning_rate": 3.964500025305907e-06, "loss": 0.77740854, "num_input_tokens_seen": 31400345, "step": 1468, "time_per_iteration": 2.661501884460449 }, { "auxiliary_loss_clip": 0.01181261, "auxiliary_loss_mlp": 0.01054252, "balance_loss_clip": 1.0629456, "balance_loss_mlp": 1.03266358, "epoch": 0.08832105816924696, "flos": 22126826897280.0, "grad_norm": 4.868504388441724, "language_loss": 0.80322379, "learning_rate": 3.9644269342321355e-06, "loss": 0.82557893, "num_input_tokens_seen": 31419620, "step": 1469, "time_per_iteration": 2.7473137378692627 }, { "auxiliary_loss_clip": 0.01198542, "auxiliary_loss_mlp": 0.01059353, "balance_loss_clip": 1.0627017, "balance_loss_mlp": 1.03677487, "epoch": 0.08838118142191492, "flos": 17566495159680.0, "grad_norm": 2.0179242193855806, "language_loss": 0.77437651, "learning_rate": 3.9643537686670974e-06, "loss": 0.79695547, "num_input_tokens_seen": 31437970, "step": 1470, "time_per_iteration": 2.7672410011291504 }, { "auxiliary_loss_clip": 0.01193825, "auxiliary_loss_mlp": 0.01067102, "balance_loss_clip": 1.06180143, "balance_loss_mlp": 1.04281926, "epoch": 0.0884413046745829, "flos": 20777196251520.0, "grad_norm": 1.6812425162011504, "language_loss": 0.84297001, "learning_rate": 3.964280528613569e-06, "loss": 0.86557925, "num_input_tokens_seen": 31457040, "step": 1471, "time_per_iteration": 2.7584216594696045 }, { "auxiliary_loss_clip": 0.01156315, "auxiliary_loss_mlp": 0.01054307, "balance_loss_clip": 1.05682266, "balance_loss_mlp": 1.03342199, "epoch": 0.08850142792725087, "flos": 22125462180480.0, "grad_norm": 1.6938350729430058, "language_loss": 0.83321345, "learning_rate": 3.964207214074324e-06, "loss": 0.85531968, "num_input_tokens_seen": 31477520, "step": 1472, "time_per_iteration": 2.7895469665527344 }, { "auxiliary_loss_clip": 0.01176151, "auxiliary_loss_mlp": 0.01058616, "balance_loss_clip": 1.06106544, "balance_loss_mlp": 1.03529835, "epoch": 0.08856155117991883, "flos": 22418744728320.0, "grad_norm": 2.3638705809965, "language_loss": 0.82781172, "learning_rate": 3.964133825052146e-06, "loss": 0.85015941, "num_input_tokens_seen": 31495575, "step": 1473, "time_per_iteration": 2.7361483573913574 }, { "auxiliary_loss_clip": 0.01129906, "auxiliary_loss_mlp": 0.01064148, "balance_loss_clip": 1.05552769, "balance_loss_mlp": 1.04263091, "epoch": 0.0886216744325868, "flos": 29937002572800.0, "grad_norm": 1.6022277785896435, "language_loss": 0.78712153, "learning_rate": 3.964060361549816e-06, "loss": 0.80906206, "num_input_tokens_seen": 31520020, "step": 1474, "time_per_iteration": 2.894319534301758 }, { "auxiliary_loss_clip": 0.01146238, "auxiliary_loss_mlp": 0.01068131, "balance_loss_clip": 1.05575764, "balance_loss_mlp": 1.04175043, "epoch": 0.08868179768525478, "flos": 23982833525760.0, "grad_norm": 1.6120869011213488, "language_loss": 0.79030406, "learning_rate": 3.963986823570121e-06, "loss": 0.81244779, "num_input_tokens_seen": 31539265, "step": 1475, "time_per_iteration": 2.8806042671203613 }, { "auxiliary_loss_clip": 0.01191986, "auxiliary_loss_mlp": 0.01047451, "balance_loss_clip": 1.05980015, "balance_loss_mlp": 1.02478909, "epoch": 0.08874192093792274, "flos": 43177553216640.0, "grad_norm": 1.4679464237421194, "language_loss": 0.74202317, "learning_rate": 3.963913211115848e-06, "loss": 0.76441753, "num_input_tokens_seen": 31563425, "step": 1476, "time_per_iteration": 2.8381049633026123 }, { "auxiliary_loss_clip": 0.01174628, "auxiliary_loss_mlp": 0.01059934, "balance_loss_clip": 1.06217527, "balance_loss_mlp": 1.03678358, "epoch": 0.0888020441905907, "flos": 32852445868800.0, "grad_norm": 1.712954575149443, "language_loss": 0.74220836, "learning_rate": 3.9638395241897895e-06, "loss": 0.76455402, "num_input_tokens_seen": 31584525, "step": 1477, "time_per_iteration": 2.8452210426330566 }, { "auxiliary_loss_clip": 0.01191865, "auxiliary_loss_mlp": 0.01051229, "balance_loss_clip": 1.06062829, "balance_loss_mlp": 1.0278163, "epoch": 0.08886216744325869, "flos": 23149347361920.0, "grad_norm": 1.95844459768748, "language_loss": 0.87194049, "learning_rate": 3.963765762794739e-06, "loss": 0.89437139, "num_input_tokens_seen": 31603325, "step": 1478, "time_per_iteration": 2.644918203353882 }, { "auxiliary_loss_clip": 0.01176299, "auxiliary_loss_mlp": 0.01058069, "balance_loss_clip": 1.0572443, "balance_loss_mlp": 1.03546739, "epoch": 0.08892229069592665, "flos": 23331593992320.0, "grad_norm": 1.6306868156426517, "language_loss": 0.77571511, "learning_rate": 3.963691926933495e-06, "loss": 0.79805881, "num_input_tokens_seen": 31624820, "step": 1479, "time_per_iteration": 2.738168954849243 }, { "auxiliary_loss_clip": 0.01164179, "auxiliary_loss_mlp": 0.010526, "balance_loss_clip": 1.05629039, "balance_loss_mlp": 1.02801871, "epoch": 0.08898241394859462, "flos": 26213784272640.0, "grad_norm": 2.199164032289915, "language_loss": 0.77797234, "learning_rate": 3.9636180166088555e-06, "loss": 0.80014014, "num_input_tokens_seen": 31646080, "step": 1480, "time_per_iteration": 2.837562322616577 }, { "auxiliary_loss_clip": 0.01180168, "auxiliary_loss_mlp": 0.01060894, "balance_loss_clip": 1.05762577, "balance_loss_mlp": 1.03656292, "epoch": 0.0890425372012626, "flos": 23550613171200.0, "grad_norm": 2.9471668635954273, "language_loss": 0.66437578, "learning_rate": 3.963544031823624e-06, "loss": 0.68678641, "num_input_tokens_seen": 31665770, "step": 1481, "time_per_iteration": 2.742422580718994 }, { "auxiliary_loss_clip": 0.01143445, "auxiliary_loss_mlp": 0.01055318, "balance_loss_clip": 1.05510306, "balance_loss_mlp": 1.03273988, "epoch": 0.08910266045393056, "flos": 23002795872000.0, "grad_norm": 2.124586862599894, "language_loss": 0.96630967, "learning_rate": 3.9634699725806065e-06, "loss": 0.9882974, "num_input_tokens_seen": 31683805, "step": 1482, "time_per_iteration": 2.8150243759155273 }, { "auxiliary_loss_clip": 0.0115336, "auxiliary_loss_mlp": 0.01057266, "balance_loss_clip": 1.05521989, "balance_loss_mlp": 1.03353167, "epoch": 0.08916278370659853, "flos": 31936508035200.0, "grad_norm": 1.7904792435575492, "language_loss": 0.78683239, "learning_rate": 3.96339583888261e-06, "loss": 0.80893862, "num_input_tokens_seen": 31704630, "step": 1483, "time_per_iteration": 2.869084119796753 }, { "auxiliary_loss_clip": 0.0116904, "auxiliary_loss_mlp": 0.01082082, "balance_loss_clip": 1.05540919, "balance_loss_mlp": 1.05829978, "epoch": 0.08922290695926649, "flos": 17530404969600.0, "grad_norm": 2.2229749189835677, "language_loss": 0.85424453, "learning_rate": 3.963321630732448e-06, "loss": 0.87675571, "num_input_tokens_seen": 31723255, "step": 1484, "time_per_iteration": 4.280332326889038 }, { "auxiliary_loss_clip": 0.01199312, "auxiliary_loss_mlp": 0.01060639, "balance_loss_clip": 1.06350458, "balance_loss_mlp": 1.03701186, "epoch": 0.08928303021193447, "flos": 32125075459200.0, "grad_norm": 1.7208139316694195, "language_loss": 0.80205405, "learning_rate": 3.963247348132932e-06, "loss": 0.82465357, "num_input_tokens_seen": 31747045, "step": 1485, "time_per_iteration": 2.761733055114746 }, { "auxiliary_loss_clip": 0.01173167, "auxiliary_loss_mlp": 0.01056554, "balance_loss_clip": 1.0563333, "balance_loss_mlp": 1.03228331, "epoch": 0.08934315346460243, "flos": 22125210785280.0, "grad_norm": 1.8969438127775513, "language_loss": 0.82859123, "learning_rate": 3.96317299108688e-06, "loss": 0.85088843, "num_input_tokens_seen": 31766615, "step": 1486, "time_per_iteration": 4.144649028778076 }, { "auxiliary_loss_clip": 0.01144509, "auxiliary_loss_mlp": 0.01063805, "balance_loss_clip": 1.05592823, "balance_loss_mlp": 1.04021382, "epoch": 0.0894032767172704, "flos": 22565583527040.0, "grad_norm": 2.1520807598980185, "language_loss": 0.76365155, "learning_rate": 3.963098559597111e-06, "loss": 0.78573477, "num_input_tokens_seen": 31785855, "step": 1487, "time_per_iteration": 4.432489395141602 }, { "auxiliary_loss_clip": 0.01157327, "auxiliary_loss_mlp": 0.01060261, "balance_loss_clip": 1.05041027, "balance_loss_mlp": 1.03542995, "epoch": 0.08946339996993838, "flos": 20193396503040.0, "grad_norm": 3.851280697857004, "language_loss": 0.83030224, "learning_rate": 3.963024053666449e-06, "loss": 0.85247803, "num_input_tokens_seen": 31804210, "step": 1488, "time_per_iteration": 2.7262001037597656 }, { "auxiliary_loss_clip": 0.01171869, "auxiliary_loss_mlp": 0.01051875, "balance_loss_clip": 1.05546355, "balance_loss_mlp": 1.02916527, "epoch": 0.08952352322260634, "flos": 48360181104000.0, "grad_norm": 1.7759111472560039, "language_loss": 0.71783459, "learning_rate": 3.962949473297718e-06, "loss": 0.74007201, "num_input_tokens_seen": 31826150, "step": 1489, "time_per_iteration": 4.562536954879761 }, { "auxiliary_loss_clip": 0.01150585, "auxiliary_loss_mlp": 0.01051382, "balance_loss_clip": 1.05190349, "balance_loss_mlp": 1.02830291, "epoch": 0.08958364647527431, "flos": 31793081028480.0, "grad_norm": 1.6999724957706692, "language_loss": 0.89717221, "learning_rate": 3.962874818493745e-06, "loss": 0.91919196, "num_input_tokens_seen": 31848060, "step": 1490, "time_per_iteration": 2.838327646255493 }, { "auxiliary_loss_clip": 0.01184278, "auxiliary_loss_mlp": 0.01064168, "balance_loss_clip": 1.05656135, "balance_loss_mlp": 1.04102957, "epoch": 0.08964376972794229, "flos": 23368186972800.0, "grad_norm": 3.9062133325383126, "language_loss": 0.73075998, "learning_rate": 3.9628000892573635e-06, "loss": 0.7532444, "num_input_tokens_seen": 31870040, "step": 1491, "time_per_iteration": 2.7007367610931396 }, { "auxiliary_loss_clip": 0.01189564, "auxiliary_loss_mlp": 0.00780167, "balance_loss_clip": 1.05968356, "balance_loss_mlp": 1.00023544, "epoch": 0.08970389298061025, "flos": 23294785530240.0, "grad_norm": 1.7021050418948058, "language_loss": 0.77235049, "learning_rate": 3.9627252855914055e-06, "loss": 0.79204774, "num_input_tokens_seen": 31890400, "step": 1492, "time_per_iteration": 2.7799623012542725 }, { "auxiliary_loss_clip": 0.01187114, "auxiliary_loss_mlp": 0.01057952, "balance_loss_clip": 1.05902028, "balance_loss_mlp": 1.03512359, "epoch": 0.08976401623327822, "flos": 33761703772800.0, "grad_norm": 1.9236790530591625, "language_loss": 0.71429193, "learning_rate": 3.962650407498707e-06, "loss": 0.73674262, "num_input_tokens_seen": 31913435, "step": 1493, "time_per_iteration": 2.8479840755462646 }, { "auxiliary_loss_clip": 0.01188796, "auxiliary_loss_mlp": 0.01057103, "balance_loss_clip": 1.05757976, "balance_loss_mlp": 1.03371406, "epoch": 0.08982413948594618, "flos": 23911335504000.0, "grad_norm": 2.6977604073852053, "language_loss": 0.87175488, "learning_rate": 3.962575454982109e-06, "loss": 0.8942138, "num_input_tokens_seen": 31932435, "step": 1494, "time_per_iteration": 2.855658769607544 }, { "auxiliary_loss_clip": 0.0108466, "auxiliary_loss_mlp": 0.01070478, "balance_loss_clip": 1.04641223, "balance_loss_mlp": 1.04551601, "epoch": 0.08988426273861416, "flos": 16837544551680.0, "grad_norm": 1.6162523894431247, "language_loss": 0.82929438, "learning_rate": 3.962500428044454e-06, "loss": 0.85084569, "num_input_tokens_seen": 31950125, "step": 1495, "time_per_iteration": 2.9265449047088623 }, { "auxiliary_loss_clip": 0.01171464, "auxiliary_loss_mlp": 0.01059756, "balance_loss_clip": 1.05779243, "balance_loss_mlp": 1.03682017, "epoch": 0.08994438599128213, "flos": 14793365548800.0, "grad_norm": 9.387255385257733, "language_loss": 0.70191383, "learning_rate": 3.962425326688585e-06, "loss": 0.72422606, "num_input_tokens_seen": 31968050, "step": 1496, "time_per_iteration": 2.773693799972534 }, { "auxiliary_loss_clip": 0.01164171, "auxiliary_loss_mlp": 0.01049454, "balance_loss_clip": 1.05397439, "balance_loss_mlp": 1.02888989, "epoch": 0.09000450924395009, "flos": 17384320356480.0, "grad_norm": 1.6327835891742186, "language_loss": 0.79752576, "learning_rate": 3.962350150917351e-06, "loss": 0.81966203, "num_input_tokens_seen": 31985675, "step": 1497, "time_per_iteration": 2.6850852966308594 }, { "auxiliary_loss_clip": 0.01129609, "auxiliary_loss_mlp": 0.01054903, "balance_loss_clip": 1.05307686, "balance_loss_mlp": 1.03146648, "epoch": 0.09006463249661807, "flos": 24280317964800.0, "grad_norm": 8.517000212139891, "language_loss": 0.82940567, "learning_rate": 3.9622749007336035e-06, "loss": 0.85125089, "num_input_tokens_seen": 32005180, "step": 1498, "time_per_iteration": 2.786205768585205 }, { "auxiliary_loss_clip": 0.01170006, "auxiliary_loss_mlp": 0.01059397, "balance_loss_clip": 1.0577898, "balance_loss_mlp": 1.03718853, "epoch": 0.09012475574928604, "flos": 13661928069120.0, "grad_norm": 2.220597323082783, "language_loss": 0.78609937, "learning_rate": 3.962199576140195e-06, "loss": 0.80839342, "num_input_tokens_seen": 32022970, "step": 1499, "time_per_iteration": 2.71785831451416 }, { "auxiliary_loss_clip": 0.01161539, "auxiliary_loss_mlp": 0.00780528, "balance_loss_clip": 1.05444527, "balance_loss_mlp": 1.00024021, "epoch": 0.090184879001954, "flos": 23327751237120.0, "grad_norm": 2.049001350461653, "language_loss": 0.93337607, "learning_rate": 3.962124177139981e-06, "loss": 0.95279682, "num_input_tokens_seen": 32043055, "step": 1500, "time_per_iteration": 2.7077536582946777 }, { "auxiliary_loss_clip": 0.01148009, "auxiliary_loss_mlp": 0.01055246, "balance_loss_clip": 1.05371249, "balance_loss_mlp": 1.0308435, "epoch": 0.09024500225462198, "flos": 23002688131200.0, "grad_norm": 3.0778515668575492, "language_loss": 0.74595469, "learning_rate": 3.962048703735822e-06, "loss": 0.76798725, "num_input_tokens_seen": 32061900, "step": 1501, "time_per_iteration": 2.7073416709899902 }, { "auxiliary_loss_clip": 0.01056535, "auxiliary_loss_mlp": 0.01013118, "balance_loss_clip": 1.03392363, "balance_loss_mlp": 1.00963676, "epoch": 0.09030512550728995, "flos": 62189203242240.0, "grad_norm": 0.7274487593473578, "language_loss": 0.58316052, "learning_rate": 3.96197315593058e-06, "loss": 0.60385704, "num_input_tokens_seen": 32122745, "step": 1502, "time_per_iteration": 3.274049997329712 }, { "auxiliary_loss_clip": 0.0114469, "auxiliary_loss_mlp": 0.01062533, "balance_loss_clip": 1.04626393, "balance_loss_mlp": 1.03896546, "epoch": 0.09036524875995791, "flos": 38800689171840.0, "grad_norm": 2.1727281711500095, "language_loss": 0.69501173, "learning_rate": 3.961897533727119e-06, "loss": 0.71708393, "num_input_tokens_seen": 32145125, "step": 1503, "time_per_iteration": 2.87554669380188 }, { "auxiliary_loss_clip": 0.01133108, "auxiliary_loss_mlp": 0.0105903, "balance_loss_clip": 1.04783726, "balance_loss_mlp": 1.03660655, "epoch": 0.09042537201262588, "flos": 21690081429120.0, "grad_norm": 2.169205134580129, "language_loss": 0.86124271, "learning_rate": 3.961821837128306e-06, "loss": 0.88316405, "num_input_tokens_seen": 32166255, "step": 1504, "time_per_iteration": 2.844688892364502 }, { "auxiliary_loss_clip": 0.01146301, "auxiliary_loss_mlp": 0.01069714, "balance_loss_clip": 1.05341232, "balance_loss_mlp": 1.04261804, "epoch": 0.09048549526529386, "flos": 22267021680000.0, "grad_norm": 2.178155372989796, "language_loss": 0.7233696, "learning_rate": 3.961746066137014e-06, "loss": 0.74552977, "num_input_tokens_seen": 32184010, "step": 1505, "time_per_iteration": 2.7992677688598633 }, { "auxiliary_loss_clip": 0.01137399, "auxiliary_loss_mlp": 0.01056414, "balance_loss_clip": 1.05097985, "balance_loss_mlp": 1.03302479, "epoch": 0.09054561851796182, "flos": 14610939350400.0, "grad_norm": 2.5107188210784526, "language_loss": 0.80730999, "learning_rate": 3.961670220756114e-06, "loss": 0.82924813, "num_input_tokens_seen": 32201635, "step": 1506, "time_per_iteration": 2.7458760738372803 }, { "auxiliary_loss_clip": 0.01140643, "auxiliary_loss_mlp": 0.01053315, "balance_loss_clip": 1.05161858, "balance_loss_mlp": 1.03197718, "epoch": 0.09060574177062979, "flos": 27636169916160.0, "grad_norm": 2.166956120197676, "language_loss": 0.75915337, "learning_rate": 3.961594300988482e-06, "loss": 0.78109294, "num_input_tokens_seen": 32221940, "step": 1507, "time_per_iteration": 2.873826742172241 }, { "auxiliary_loss_clip": 0.01051873, "auxiliary_loss_mlp": 0.01005715, "balance_loss_clip": 1.02043629, "balance_loss_mlp": 1.00175714, "epoch": 0.09066586502329776, "flos": 66085797513600.0, "grad_norm": 0.7272435825555993, "language_loss": 0.57699698, "learning_rate": 3.961518306836998e-06, "loss": 0.59757286, "num_input_tokens_seen": 32276495, "step": 1508, "time_per_iteration": 3.064926862716675 }, { "auxiliary_loss_clip": 0.01165416, "auxiliary_loss_mlp": 0.01054804, "balance_loss_clip": 1.055233, "balance_loss_mlp": 1.03155804, "epoch": 0.09072598827596573, "flos": 18916449027840.0, "grad_norm": 1.7601330807914457, "language_loss": 0.85090744, "learning_rate": 3.961442238304543e-06, "loss": 0.87310958, "num_input_tokens_seen": 32294130, "step": 1509, "time_per_iteration": 2.6664113998413086 }, { "auxiliary_loss_clip": 0.01168837, "auxiliary_loss_mlp": 0.01064138, "balance_loss_clip": 1.05745769, "balance_loss_mlp": 1.03949761, "epoch": 0.0907861115286337, "flos": 24821742643200.0, "grad_norm": 2.3794507710009203, "language_loss": 0.84110659, "learning_rate": 3.961366095394002e-06, "loss": 0.8634364, "num_input_tokens_seen": 32313555, "step": 1510, "time_per_iteration": 2.783484697341919 }, { "auxiliary_loss_clip": 0.01153141, "auxiliary_loss_mlp": 0.01058569, "balance_loss_clip": 1.05423617, "balance_loss_mlp": 1.03482211, "epoch": 0.09084623478130167, "flos": 21652842003840.0, "grad_norm": 1.8490761573484715, "language_loss": 0.85247588, "learning_rate": 3.961289878108262e-06, "loss": 0.87459302, "num_input_tokens_seen": 32331430, "step": 1511, "time_per_iteration": 2.714620351791382 }, { "auxiliary_loss_clip": 0.01145395, "auxiliary_loss_mlp": 0.01052919, "balance_loss_clip": 1.05182219, "balance_loss_mlp": 1.02983987, "epoch": 0.09090635803396964, "flos": 27639258485760.0, "grad_norm": 1.5734326837562458, "language_loss": 0.84977764, "learning_rate": 3.9612135864502135e-06, "loss": 0.87176073, "num_input_tokens_seen": 32353705, "step": 1512, "time_per_iteration": 2.75361704826355 }, { "auxiliary_loss_clip": 0.01155239, "auxiliary_loss_mlp": 0.01053669, "balance_loss_clip": 1.05740952, "balance_loss_mlp": 1.03185391, "epoch": 0.0909664812866376, "flos": 17669127294720.0, "grad_norm": 3.0235926431973654, "language_loss": 0.87346804, "learning_rate": 3.961137220422749e-06, "loss": 0.89555705, "num_input_tokens_seen": 32370520, "step": 1513, "time_per_iteration": 2.6864211559295654 }, { "auxiliary_loss_clip": 0.01168585, "auxiliary_loss_mlp": 0.01049408, "balance_loss_clip": 1.05562937, "balance_loss_mlp": 1.02841544, "epoch": 0.09102660453930557, "flos": 23951448017280.0, "grad_norm": 1.7883280971870592, "language_loss": 0.86802679, "learning_rate": 3.961060780028764e-06, "loss": 0.89020675, "num_input_tokens_seen": 32389105, "step": 1514, "time_per_iteration": 2.6788065433502197 }, { "auxiliary_loss_clip": 0.01134005, "auxiliary_loss_mlp": 0.01064386, "balance_loss_clip": 1.05571628, "balance_loss_mlp": 1.04252315, "epoch": 0.09108672779197355, "flos": 25812949426560.0, "grad_norm": 1.7666120550996132, "language_loss": 0.89944756, "learning_rate": 3.960984265271159e-06, "loss": 0.92143154, "num_input_tokens_seen": 32408065, "step": 1515, "time_per_iteration": 2.757390022277832 }, { "auxiliary_loss_clip": 0.01162518, "auxiliary_loss_mlp": 0.01056937, "balance_loss_clip": 1.05547726, "balance_loss_mlp": 1.03360808, "epoch": 0.09114685104464151, "flos": 29639482220160.0, "grad_norm": 2.1090985009837646, "language_loss": 0.85576892, "learning_rate": 3.9609076761528335e-06, "loss": 0.87796342, "num_input_tokens_seen": 32427225, "step": 1516, "time_per_iteration": 2.704784870147705 }, { "auxiliary_loss_clip": 0.01158781, "auxiliary_loss_mlp": 0.01057165, "balance_loss_clip": 1.05135357, "balance_loss_mlp": 1.03451526, "epoch": 0.09120697429730948, "flos": 33729635905920.0, "grad_norm": 2.086405156201108, "language_loss": 0.81167233, "learning_rate": 3.960831012676692e-06, "loss": 0.83383185, "num_input_tokens_seen": 32450510, "step": 1517, "time_per_iteration": 2.8586854934692383 }, { "auxiliary_loss_clip": 0.0117857, "auxiliary_loss_mlp": 0.01065492, "balance_loss_clip": 1.05741739, "balance_loss_mlp": 1.04280686, "epoch": 0.09126709754997746, "flos": 18401381953920.0, "grad_norm": 2.104468567304263, "language_loss": 0.78067243, "learning_rate": 3.960754274845642e-06, "loss": 0.80311304, "num_input_tokens_seen": 32468425, "step": 1518, "time_per_iteration": 2.7862088680267334 }, { "auxiliary_loss_clip": 0.01165395, "auxiliary_loss_mlp": 0.01061371, "balance_loss_clip": 1.05285823, "balance_loss_mlp": 1.03900695, "epoch": 0.09132722080264542, "flos": 22091957769600.0, "grad_norm": 1.6816479812467473, "language_loss": 0.86124098, "learning_rate": 3.960677462662594e-06, "loss": 0.88350856, "num_input_tokens_seen": 32487510, "step": 1519, "time_per_iteration": 2.723714828491211 }, { "auxiliary_loss_clip": 0.01163599, "auxiliary_loss_mlp": 0.01052792, "balance_loss_clip": 1.05454183, "balance_loss_mlp": 1.02914131, "epoch": 0.09138734405531339, "flos": 21033131633280.0, "grad_norm": 1.9681293960876167, "language_loss": 0.73279071, "learning_rate": 3.96060057613046e-06, "loss": 0.75495458, "num_input_tokens_seen": 32507250, "step": 1520, "time_per_iteration": 2.8098628520965576 }, { "auxiliary_loss_clip": 0.01161166, "auxiliary_loss_mlp": 0.01058035, "balance_loss_clip": 1.05696058, "balance_loss_mlp": 1.03469419, "epoch": 0.09144746730798137, "flos": 20083940784000.0, "grad_norm": 2.6988457876937066, "language_loss": 0.85236609, "learning_rate": 3.960523615252156e-06, "loss": 0.87455815, "num_input_tokens_seen": 32526045, "step": 1521, "time_per_iteration": 2.7134172916412354 }, { "auxiliary_loss_clip": 0.01120174, "auxiliary_loss_mlp": 0.01063979, "balance_loss_clip": 1.05189717, "balance_loss_mlp": 1.03991079, "epoch": 0.09150759056064933, "flos": 22778210085120.0, "grad_norm": 1.6991603177293335, "language_loss": 0.83933008, "learning_rate": 3.960446580030599e-06, "loss": 0.8611716, "num_input_tokens_seen": 32546575, "step": 1522, "time_per_iteration": 2.93745493888855 }, { "auxiliary_loss_clip": 0.01182362, "auxiliary_loss_mlp": 0.01064589, "balance_loss_clip": 1.05630755, "balance_loss_mlp": 1.04153395, "epoch": 0.0915677138133173, "flos": 27564205017600.0, "grad_norm": 1.647915064434875, "language_loss": 0.81012994, "learning_rate": 3.960369470468711e-06, "loss": 0.8325994, "num_input_tokens_seen": 32568795, "step": 1523, "time_per_iteration": 4.378152847290039 }, { "auxiliary_loss_clip": 0.01157976, "auxiliary_loss_mlp": 0.00781395, "balance_loss_clip": 1.05422449, "balance_loss_mlp": 1.00037968, "epoch": 0.09162783706598528, "flos": 17674765729920.0, "grad_norm": 2.106497620262502, "language_loss": 0.7460072, "learning_rate": 3.960292286569418e-06, "loss": 0.76540089, "num_input_tokens_seen": 32587010, "step": 1524, "time_per_iteration": 2.7146124839782715 }, { "auxiliary_loss_clip": 0.01135228, "auxiliary_loss_mlp": 0.0106119, "balance_loss_clip": 1.05092478, "balance_loss_mlp": 1.03782487, "epoch": 0.09168796031865324, "flos": 18478195188480.0, "grad_norm": 2.0992608845945413, "language_loss": 0.86498803, "learning_rate": 3.960215028335644e-06, "loss": 0.88695222, "num_input_tokens_seen": 32602375, "step": 1525, "time_per_iteration": 4.314826965332031 }, { "auxiliary_loss_clip": 0.01164396, "auxiliary_loss_mlp": 0.01049506, "balance_loss_clip": 1.05688822, "balance_loss_mlp": 1.0263319, "epoch": 0.0917480835713212, "flos": 29387605075200.0, "grad_norm": 2.1146348399758237, "language_loss": 0.74512708, "learning_rate": 3.96013769577032e-06, "loss": 0.76726609, "num_input_tokens_seen": 32621460, "step": 1526, "time_per_iteration": 5.878855466842651 }, { "auxiliary_loss_clip": 0.01186002, "auxiliary_loss_mlp": 0.01055817, "balance_loss_clip": 1.05732703, "balance_loss_mlp": 1.03392982, "epoch": 0.09180820682398917, "flos": 19829262378240.0, "grad_norm": 2.5135282962071215, "language_loss": 0.77581728, "learning_rate": 3.960060288876378e-06, "loss": 0.79823542, "num_input_tokens_seen": 32640440, "step": 1527, "time_per_iteration": 2.693847179412842 }, { "auxiliary_loss_clip": 0.01173605, "auxiliary_loss_mlp": 0.01052264, "balance_loss_clip": 1.0534333, "balance_loss_mlp": 1.02868414, "epoch": 0.09186833007665715, "flos": 23841848643840.0, "grad_norm": 2.655631139677705, "language_loss": 0.78546697, "learning_rate": 3.959982807656753e-06, "loss": 0.80772561, "num_input_tokens_seen": 32660020, "step": 1528, "time_per_iteration": 2.774219512939453 }, { "auxiliary_loss_clip": 0.01146017, "auxiliary_loss_mlp": 0.01050376, "balance_loss_clip": 1.0499053, "balance_loss_mlp": 1.02827477, "epoch": 0.09192845332932512, "flos": 12932726065920.0, "grad_norm": 2.682547324044482, "language_loss": 0.76732361, "learning_rate": 3.959905252114384e-06, "loss": 0.78928751, "num_input_tokens_seen": 32678170, "step": 1529, "time_per_iteration": 4.603156089782715 }, { "auxiliary_loss_clip": 0.01186538, "auxiliary_loss_mlp": 0.00780856, "balance_loss_clip": 1.05415928, "balance_loss_mlp": 1.00045025, "epoch": 0.09198857658199308, "flos": 24568177559040.0, "grad_norm": 1.7410660090049153, "language_loss": 0.82906747, "learning_rate": 3.959827622252211e-06, "loss": 0.84874141, "num_input_tokens_seen": 32697540, "step": 1530, "time_per_iteration": 2.7118582725524902 }, { "auxiliary_loss_clip": 0.01130108, "auxiliary_loss_mlp": 0.0106509, "balance_loss_clip": 1.04975331, "balance_loss_mlp": 1.04220152, "epoch": 0.09204869983466106, "flos": 20266941600000.0, "grad_norm": 2.182960664479704, "language_loss": 0.84001881, "learning_rate": 3.959749918073179e-06, "loss": 0.86197078, "num_input_tokens_seen": 32716805, "step": 1531, "time_per_iteration": 2.791947603225708 }, { "auxiliary_loss_clip": 0.0113655, "auxiliary_loss_mlp": 0.01051554, "balance_loss_clip": 1.04906452, "balance_loss_mlp": 1.02853465, "epoch": 0.09210882308732903, "flos": 20885646389760.0, "grad_norm": 1.7570281394880602, "language_loss": 0.81253195, "learning_rate": 3.959672139580233e-06, "loss": 0.83441293, "num_input_tokens_seen": 32736385, "step": 1532, "time_per_iteration": 2.737739324569702 }, { "auxiliary_loss_clip": 0.01157728, "auxiliary_loss_mlp": 0.01056753, "balance_loss_clip": 1.052163, "balance_loss_mlp": 1.03385305, "epoch": 0.09216894633999699, "flos": 30956326727040.0, "grad_norm": 2.2821036564882182, "language_loss": 0.84194255, "learning_rate": 3.9595942867763235e-06, "loss": 0.86408734, "num_input_tokens_seen": 32757140, "step": 1533, "time_per_iteration": 2.7542598247528076 }, { "auxiliary_loss_clip": 0.01149262, "auxiliary_loss_mlp": 0.01053623, "balance_loss_clip": 1.05813503, "balance_loss_mlp": 1.03190327, "epoch": 0.09222906959266497, "flos": 13151565676800.0, "grad_norm": 1.9396914937933663, "language_loss": 0.9009546, "learning_rate": 3.959516359664402e-06, "loss": 0.92298347, "num_input_tokens_seen": 32774860, "step": 1534, "time_per_iteration": 2.6450984477996826 }, { "auxiliary_loss_clip": 0.01150273, "auxiliary_loss_mlp": 0.0106298, "balance_loss_clip": 1.0495038, "balance_loss_mlp": 1.03849435, "epoch": 0.09228919284533293, "flos": 25994477784960.0, "grad_norm": 5.065477266086046, "language_loss": 0.75779241, "learning_rate": 3.959438358247424e-06, "loss": 0.77992499, "num_input_tokens_seen": 32795250, "step": 1535, "time_per_iteration": 2.730915069580078 }, { "auxiliary_loss_clip": 0.01168283, "auxiliary_loss_mlp": 0.01045276, "balance_loss_clip": 1.05278873, "balance_loss_mlp": 1.02403271, "epoch": 0.0923493160980009, "flos": 18660800954880.0, "grad_norm": 1.8085584532497372, "language_loss": 0.81631637, "learning_rate": 3.959360282528346e-06, "loss": 0.83845198, "num_input_tokens_seen": 32813805, "step": 1536, "time_per_iteration": 2.7326817512512207 }, { "auxiliary_loss_clip": 0.01181977, "auxiliary_loss_mlp": 0.01053699, "balance_loss_clip": 1.05431938, "balance_loss_mlp": 1.03224182, "epoch": 0.09240943935066886, "flos": 21140576190720.0, "grad_norm": 2.0929096884707556, "language_loss": 0.89092755, "learning_rate": 3.959282132510131e-06, "loss": 0.9132843, "num_input_tokens_seen": 32830960, "step": 1537, "time_per_iteration": 2.675771713256836 }, { "auxiliary_loss_clip": 0.01157238, "auxiliary_loss_mlp": 0.01058647, "balance_loss_clip": 1.05114293, "balance_loss_mlp": 1.03605688, "epoch": 0.09246956260333684, "flos": 20592435669120.0, "grad_norm": 1.9480116987165197, "language_loss": 0.80702311, "learning_rate": 3.959203908195741e-06, "loss": 0.82918191, "num_input_tokens_seen": 32848275, "step": 1538, "time_per_iteration": 2.71618390083313 }, { "auxiliary_loss_clip": 0.01060495, "auxiliary_loss_mlp": 0.0101237, "balance_loss_clip": 1.03095436, "balance_loss_mlp": 1.00872231, "epoch": 0.09252968585600481, "flos": 67558710614400.0, "grad_norm": 0.7534074452314953, "language_loss": 0.57429332, "learning_rate": 3.959125609588142e-06, "loss": 0.59502202, "num_input_tokens_seen": 32917730, "step": 1539, "time_per_iteration": 3.3933441638946533 }, { "auxiliary_loss_clip": 0.01159831, "auxiliary_loss_mlp": 0.01050602, "balance_loss_clip": 1.05638027, "balance_loss_mlp": 1.02863121, "epoch": 0.09258980910867277, "flos": 17383853479680.0, "grad_norm": 2.849299216868502, "language_loss": 0.67554641, "learning_rate": 3.959047236690304e-06, "loss": 0.69765073, "num_input_tokens_seen": 32934910, "step": 1540, "time_per_iteration": 2.757084608078003 }, { "auxiliary_loss_clip": 0.01144239, "auxiliary_loss_mlp": 0.01048444, "balance_loss_clip": 1.04954028, "balance_loss_mlp": 1.026438, "epoch": 0.09264993236134075, "flos": 19865927185920.0, "grad_norm": 2.044335478602743, "language_loss": 0.83917534, "learning_rate": 3.958968789505198e-06, "loss": 0.86110216, "num_input_tokens_seen": 32953840, "step": 1541, "time_per_iteration": 2.8497180938720703 }, { "auxiliary_loss_clip": 0.01077839, "auxiliary_loss_mlp": 0.01013078, "balance_loss_clip": 1.02602255, "balance_loss_mlp": 1.0097636, "epoch": 0.09271005561400872, "flos": 62284401262080.0, "grad_norm": 0.8790732834061692, "language_loss": 0.61881655, "learning_rate": 3.9588902680358e-06, "loss": 0.63972563, "num_input_tokens_seen": 33011410, "step": 1542, "time_per_iteration": 3.3079330921173096 }, { "auxiliary_loss_clip": 0.01161232, "auxiliary_loss_mlp": 0.01059438, "balance_loss_clip": 1.05441117, "balance_loss_mlp": 1.03808808, "epoch": 0.09277017886667668, "flos": 23329870139520.0, "grad_norm": 1.6256118826429122, "language_loss": 0.82802349, "learning_rate": 3.958811672285086e-06, "loss": 0.85023022, "num_input_tokens_seen": 33031675, "step": 1543, "time_per_iteration": 2.7408807277679443 }, { "auxiliary_loss_clip": 0.01135873, "auxiliary_loss_mlp": 0.01060295, "balance_loss_clip": 1.04848838, "balance_loss_mlp": 1.03863442, "epoch": 0.09283030211934466, "flos": 54745169875200.0, "grad_norm": 1.706948475246468, "language_loss": 0.72265279, "learning_rate": 3.958733002256038e-06, "loss": 0.74461448, "num_input_tokens_seen": 33056355, "step": 1544, "time_per_iteration": 3.104156255722046 }, { "auxiliary_loss_clip": 0.01166071, "auxiliary_loss_mlp": 0.01055881, "balance_loss_clip": 1.05165935, "balance_loss_mlp": 1.03138375, "epoch": 0.09289042537201263, "flos": 30334784762880.0, "grad_norm": 1.7720844214030114, "language_loss": 0.77286768, "learning_rate": 3.958654257951637e-06, "loss": 0.79508722, "num_input_tokens_seen": 33079520, "step": 1545, "time_per_iteration": 2.808180570602417 }, { "auxiliary_loss_clip": 0.01140161, "auxiliary_loss_mlp": 0.01050495, "balance_loss_clip": 1.0526737, "balance_loss_mlp": 1.02872682, "epoch": 0.09295054862468059, "flos": 17746838369280.0, "grad_norm": 2.7089619481030076, "language_loss": 0.74396008, "learning_rate": 3.9585754393748706e-06, "loss": 0.76586664, "num_input_tokens_seen": 33096135, "step": 1546, "time_per_iteration": 2.7634081840515137 }, { "auxiliary_loss_clip": 0.01163775, "auxiliary_loss_mlp": 0.0105305, "balance_loss_clip": 1.05357957, "balance_loss_mlp": 1.02956545, "epoch": 0.09301067187734856, "flos": 23658021815040.0, "grad_norm": 1.9423225100503794, "language_loss": 0.84200966, "learning_rate": 3.9584965465287275e-06, "loss": 0.86417794, "num_input_tokens_seen": 33115245, "step": 1547, "time_per_iteration": 2.790003776550293 }, { "auxiliary_loss_clip": 0.01141839, "auxiliary_loss_mlp": 0.01053941, "balance_loss_clip": 1.04740989, "balance_loss_mlp": 1.03195918, "epoch": 0.09307079513001654, "flos": 27527719777920.0, "grad_norm": 2.6545433694843488, "language_loss": 0.67698336, "learning_rate": 3.958417579416199e-06, "loss": 0.69894123, "num_input_tokens_seen": 33136640, "step": 1548, "time_per_iteration": 2.8367013931274414 }, { "auxiliary_loss_clip": 0.01123899, "auxiliary_loss_mlp": 0.01059885, "balance_loss_clip": 1.04744387, "balance_loss_mlp": 1.03754544, "epoch": 0.0931309183826845, "flos": 20627340710400.0, "grad_norm": 1.6829727803454704, "language_loss": 0.8326273, "learning_rate": 3.9583385380402795e-06, "loss": 0.85446513, "num_input_tokens_seen": 33155060, "step": 1549, "time_per_iteration": 2.8462016582489014 }, { "auxiliary_loss_clip": 0.01176243, "auxiliary_loss_mlp": 0.0104617, "balance_loss_clip": 1.05815506, "balance_loss_mlp": 1.02473652, "epoch": 0.09319104163535247, "flos": 29020921084800.0, "grad_norm": 1.5528514681372962, "language_loss": 0.75838119, "learning_rate": 3.958259422403966e-06, "loss": 0.78060532, "num_input_tokens_seen": 33175420, "step": 1550, "time_per_iteration": 2.7325351238250732 }, { "auxiliary_loss_clip": 0.01150315, "auxiliary_loss_mlp": 0.01069257, "balance_loss_clip": 1.05249369, "balance_loss_mlp": 1.04483092, "epoch": 0.09325116488802045, "flos": 25301545539840.0, "grad_norm": 2.1922696027472233, "language_loss": 0.82828665, "learning_rate": 3.95818023251026e-06, "loss": 0.85048234, "num_input_tokens_seen": 33194120, "step": 1551, "time_per_iteration": 2.852602481842041 }, { "auxiliary_loss_clip": 0.01064371, "auxiliary_loss_mlp": 0.00760109, "balance_loss_clip": 1.02203059, "balance_loss_mlp": 0.99984246, "epoch": 0.09331128814068841, "flos": 61536203942400.0, "grad_norm": 0.7384225982202158, "language_loss": 0.61837572, "learning_rate": 3.958100968362163e-06, "loss": 0.63662052, "num_input_tokens_seen": 33261080, "step": 1552, "time_per_iteration": 3.3453099727630615 }, { "auxiliary_loss_clip": 0.01059175, "auxiliary_loss_mlp": 0.01016654, "balance_loss_clip": 1.02415061, "balance_loss_mlp": 1.01338792, "epoch": 0.09337141139335638, "flos": 53293700171520.0, "grad_norm": 0.8524917480784928, "language_loss": 0.58986926, "learning_rate": 3.958021629962681e-06, "loss": 0.61062753, "num_input_tokens_seen": 33330235, "step": 1553, "time_per_iteration": 3.37673282623291 }, { "auxiliary_loss_clip": 0.01146955, "auxiliary_loss_mlp": 0.01056683, "balance_loss_clip": 1.05026984, "balance_loss_mlp": 1.03336585, "epoch": 0.09343153464602436, "flos": 23476852592640.0, "grad_norm": 2.3365109182487, "language_loss": 0.87665397, "learning_rate": 3.957942217314823e-06, "loss": 0.8986904, "num_input_tokens_seen": 33349035, "step": 1554, "time_per_iteration": 2.8098127841949463 }, { "auxiliary_loss_clip": 0.01153047, "auxiliary_loss_mlp": 0.01057257, "balance_loss_clip": 1.05439448, "balance_loss_mlp": 1.03393972, "epoch": 0.09349165789869232, "flos": 19353481804800.0, "grad_norm": 4.388884220182432, "language_loss": 0.81678319, "learning_rate": 3.957862730421599e-06, "loss": 0.83888626, "num_input_tokens_seen": 33368060, "step": 1555, "time_per_iteration": 2.726207971572876 }, { "auxiliary_loss_clip": 0.01058869, "auxiliary_loss_mlp": 0.01003892, "balance_loss_clip": 1.0202632, "balance_loss_mlp": 1.00045919, "epoch": 0.09355178115136029, "flos": 67502580635520.0, "grad_norm": 0.8683826280274983, "language_loss": 0.59606886, "learning_rate": 3.957783169286024e-06, "loss": 0.61669648, "num_input_tokens_seen": 33430825, "step": 1556, "time_per_iteration": 3.209326982498169 }, { "auxiliary_loss_clip": 0.01174249, "auxiliary_loss_mlp": 0.01059741, "balance_loss_clip": 1.05518138, "balance_loss_mlp": 1.03727031, "epoch": 0.09361190440402825, "flos": 37341638720640.0, "grad_norm": 1.6803158790244075, "language_loss": 0.84290808, "learning_rate": 3.9577035339111155e-06, "loss": 0.86524796, "num_input_tokens_seen": 33454855, "step": 1557, "time_per_iteration": 2.831650733947754 }, { "auxiliary_loss_clip": 0.01110857, "auxiliary_loss_mlp": 0.01065156, "balance_loss_clip": 1.04900038, "balance_loss_mlp": 1.04112351, "epoch": 0.09367202765669623, "flos": 24899705112960.0, "grad_norm": 1.6725809358966677, "language_loss": 0.780913, "learning_rate": 3.957623824299893e-06, "loss": 0.8026731, "num_input_tokens_seen": 33476000, "step": 1558, "time_per_iteration": 3.0111780166625977 }, { "auxiliary_loss_clip": 0.01164994, "auxiliary_loss_mlp": 0.01051229, "balance_loss_clip": 1.0558666, "balance_loss_mlp": 1.02881753, "epoch": 0.0937321509093642, "flos": 15705568368000.0, "grad_norm": 2.0141986314124414, "language_loss": 0.80066288, "learning_rate": 3.957544040455379e-06, "loss": 0.82282507, "num_input_tokens_seen": 33493845, "step": 1559, "time_per_iteration": 3.024117946624756 }, { "auxiliary_loss_clip": 0.01141277, "auxiliary_loss_mlp": 0.01061718, "balance_loss_clip": 1.05060387, "balance_loss_mlp": 1.04012942, "epoch": 0.09379227416203216, "flos": 20483698222080.0, "grad_norm": 1.8358373674042003, "language_loss": 0.76418209, "learning_rate": 3.957464182380599e-06, "loss": 0.78621197, "num_input_tokens_seen": 33510850, "step": 1560, "time_per_iteration": 2.68558406829834 }, { "auxiliary_loss_clip": 0.01137939, "auxiliary_loss_mlp": 0.01054925, "balance_loss_clip": 1.05014277, "balance_loss_mlp": 1.03213274, "epoch": 0.09385239741470014, "flos": 24352498344960.0, "grad_norm": 3.575155933252121, "language_loss": 0.80784953, "learning_rate": 3.95738425007858e-06, "loss": 0.82977819, "num_input_tokens_seen": 33530430, "step": 1561, "time_per_iteration": 2.759148359298706 }, { "auxiliary_loss_clip": 0.01173652, "auxiliary_loss_mlp": 0.01052448, "balance_loss_clip": 1.05276573, "balance_loss_mlp": 1.02989376, "epoch": 0.0939125206673681, "flos": 33291489807360.0, "grad_norm": 2.448664627367939, "language_loss": 0.6140722, "learning_rate": 3.957304243552354e-06, "loss": 0.63633323, "num_input_tokens_seen": 33551975, "step": 1562, "time_per_iteration": 2.9014978408813477 }, { "auxiliary_loss_clip": 0.01162693, "auxiliary_loss_mlp": 0.0106374, "balance_loss_clip": 1.05719543, "balance_loss_mlp": 1.04213953, "epoch": 0.09397264392003607, "flos": 19244923925760.0, "grad_norm": 3.5098220300578555, "language_loss": 0.8496151, "learning_rate": 3.957224162804956e-06, "loss": 0.87187934, "num_input_tokens_seen": 33569850, "step": 1563, "time_per_iteration": 4.404061555862427 }, { "auxiliary_loss_clip": 0.01164811, "auxiliary_loss_mlp": 0.01047932, "balance_loss_clip": 1.05775142, "balance_loss_mlp": 1.02652228, "epoch": 0.09403276717270405, "flos": 19317930318720.0, "grad_norm": 1.6765528861156813, "language_loss": 0.76511294, "learning_rate": 3.9571440078394205e-06, "loss": 0.78724039, "num_input_tokens_seen": 33590510, "step": 1564, "time_per_iteration": 4.255565166473389 }, { "auxiliary_loss_clip": 0.01151297, "auxiliary_loss_mlp": 0.01063256, "balance_loss_clip": 1.05196142, "balance_loss_mlp": 1.04172707, "epoch": 0.09409289042537201, "flos": 23583471137280.0, "grad_norm": 1.9762038777899962, "language_loss": 0.80134326, "learning_rate": 3.9570637786587895e-06, "loss": 0.82348871, "num_input_tokens_seen": 33608810, "step": 1565, "time_per_iteration": 2.8548545837402344 }, { "auxiliary_loss_clip": 0.01158602, "auxiliary_loss_mlp": 0.01063767, "balance_loss_clip": 1.05420566, "balance_loss_mlp": 1.04233313, "epoch": 0.09415301367803998, "flos": 20078446003200.0, "grad_norm": 1.6810250981626251, "language_loss": 0.75134379, "learning_rate": 3.956983475266103e-06, "loss": 0.77356744, "num_input_tokens_seen": 33627265, "step": 1566, "time_per_iteration": 4.889045715332031 }, { "auxiliary_loss_clip": 0.01145856, "auxiliary_loss_mlp": 0.00780689, "balance_loss_clip": 1.05168366, "balance_loss_mlp": 1.00022864, "epoch": 0.09421313693070796, "flos": 21062075016960.0, "grad_norm": 1.6828919748843199, "language_loss": 0.77958012, "learning_rate": 3.956903097664407e-06, "loss": 0.79884553, "num_input_tokens_seen": 33644810, "step": 1567, "time_per_iteration": 4.445765972137451 }, { "auxiliary_loss_clip": 0.01156815, "auxiliary_loss_mlp": 0.01056228, "balance_loss_clip": 1.05256855, "balance_loss_mlp": 1.03591454, "epoch": 0.09427326018337592, "flos": 24316156759680.0, "grad_norm": 2.008686295040646, "language_loss": 0.82608044, "learning_rate": 3.956822645856749e-06, "loss": 0.84821093, "num_input_tokens_seen": 33665665, "step": 1568, "time_per_iteration": 2.881535768508911 }, { "auxiliary_loss_clip": 0.01187915, "auxiliary_loss_mlp": 0.01051731, "balance_loss_clip": 1.05717778, "balance_loss_mlp": 1.02927184, "epoch": 0.09433338343604389, "flos": 20263888944000.0, "grad_norm": 1.9573151026586577, "language_loss": 0.76943743, "learning_rate": 3.9567421198461814e-06, "loss": 0.79183388, "num_input_tokens_seen": 33684760, "step": 1569, "time_per_iteration": 2.6097726821899414 }, { "auxiliary_loss_clip": 0.01120191, "auxiliary_loss_mlp": 0.01060805, "balance_loss_clip": 1.04771852, "balance_loss_mlp": 1.03625941, "epoch": 0.09439350668871185, "flos": 12742973493120.0, "grad_norm": 3.3813700161908917, "language_loss": 0.85488856, "learning_rate": 3.956661519635756e-06, "loss": 0.87669849, "num_input_tokens_seen": 33700750, "step": 1570, "time_per_iteration": 2.7571377754211426 }, { "auxiliary_loss_clip": 0.01122458, "auxiliary_loss_mlp": 0.01055939, "balance_loss_clip": 1.04927301, "balance_loss_mlp": 1.03183508, "epoch": 0.09445362994137983, "flos": 25962266263680.0, "grad_norm": 1.540414635950846, "language_loss": 0.76415235, "learning_rate": 3.95658084522853e-06, "loss": 0.7859363, "num_input_tokens_seen": 33724430, "step": 1571, "time_per_iteration": 2.913569211959839 }, { "auxiliary_loss_clip": 0.01135683, "auxiliary_loss_mlp": 0.01057111, "balance_loss_clip": 1.0490278, "balance_loss_mlp": 1.0349735, "epoch": 0.0945137531940478, "flos": 19715353372800.0, "grad_norm": 1.6745378641752047, "language_loss": 0.79397607, "learning_rate": 3.956500096627561e-06, "loss": 0.81590402, "num_input_tokens_seen": 33743455, "step": 1572, "time_per_iteration": 2.813410758972168 }, { "auxiliary_loss_clip": 0.01148251, "auxiliary_loss_mlp": 0.0106927, "balance_loss_clip": 1.05619979, "balance_loss_mlp": 1.04524922, "epoch": 0.09457387644671576, "flos": 23617047375360.0, "grad_norm": 1.7559396294879055, "language_loss": 0.87707287, "learning_rate": 3.956419273835913e-06, "loss": 0.89924812, "num_input_tokens_seen": 33763435, "step": 1573, "time_per_iteration": 2.776535987854004 }, { "auxiliary_loss_clip": 0.01161183, "auxiliary_loss_mlp": 0.01063326, "balance_loss_clip": 1.05485129, "balance_loss_mlp": 1.03804219, "epoch": 0.09463399969938374, "flos": 26907291135360.0, "grad_norm": 2.9707854698090097, "language_loss": 0.81982428, "learning_rate": 3.95633837685665e-06, "loss": 0.84206939, "num_input_tokens_seen": 33784325, "step": 1574, "time_per_iteration": 2.7604806423187256 }, { "auxiliary_loss_clip": 0.01156287, "auxiliary_loss_mlp": 0.01055594, "balance_loss_clip": 1.05234718, "balance_loss_mlp": 1.0344342, "epoch": 0.0946941229520517, "flos": 23659566099840.0, "grad_norm": 1.7178511535677499, "language_loss": 0.80855322, "learning_rate": 3.95625740569284e-06, "loss": 0.83067203, "num_input_tokens_seen": 33802510, "step": 1575, "time_per_iteration": 2.713247299194336 }, { "auxiliary_loss_clip": 0.01182326, "auxiliary_loss_mlp": 0.01068689, "balance_loss_clip": 1.05578864, "balance_loss_mlp": 1.04581285, "epoch": 0.09475424620471967, "flos": 24134053783680.0, "grad_norm": 1.9110861379460222, "language_loss": 0.86483347, "learning_rate": 3.956176360347553e-06, "loss": 0.88734365, "num_input_tokens_seen": 33819980, "step": 1576, "time_per_iteration": 2.682644844055176 }, { "auxiliary_loss_clip": 0.01056441, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 1.0225811, "balance_loss_mlp": 1.02344561, "epoch": 0.09481436945738765, "flos": 68426168065920.0, "grad_norm": 0.9789918611127905, "language_loss": 0.6582402, "learning_rate": 3.956095240823862e-06, "loss": 0.67907751, "num_input_tokens_seen": 33878925, "step": 1577, "time_per_iteration": 3.2106685638427734 }, { "auxiliary_loss_clip": 0.01147668, "auxiliary_loss_mlp": 0.01051958, "balance_loss_clip": 1.05218005, "balance_loss_mlp": 1.03098869, "epoch": 0.09487449271005562, "flos": 16654076858880.0, "grad_norm": 1.8223175005615506, "language_loss": 0.79152733, "learning_rate": 3.956014047124844e-06, "loss": 0.81352365, "num_input_tokens_seen": 33897600, "step": 1578, "time_per_iteration": 2.820089340209961 }, { "auxiliary_loss_clip": 0.01185941, "auxiliary_loss_mlp": 0.01066432, "balance_loss_clip": 1.05838132, "balance_loss_mlp": 1.04437804, "epoch": 0.09493461596272358, "flos": 24275685110400.0, "grad_norm": 3.480730999818176, "language_loss": 0.78161818, "learning_rate": 3.955932779253578e-06, "loss": 0.80414188, "num_input_tokens_seen": 33917365, "step": 1579, "time_per_iteration": 2.6518983840942383 }, { "auxiliary_loss_clip": 0.01128319, "auxiliary_loss_mlp": 0.01065633, "balance_loss_clip": 1.04771328, "balance_loss_mlp": 1.04001498, "epoch": 0.09499473921539155, "flos": 21870173243520.0, "grad_norm": 2.0084876987684526, "language_loss": 0.73410392, "learning_rate": 3.955851437213144e-06, "loss": 0.75604343, "num_input_tokens_seen": 33936680, "step": 1580, "time_per_iteration": 2.679461717605591 }, { "auxiliary_loss_clip": 0.01157568, "auxiliary_loss_mlp": 0.01062628, "balance_loss_clip": 1.05573344, "balance_loss_mlp": 1.04095626, "epoch": 0.09505486246805953, "flos": 33547137880320.0, "grad_norm": 14.809542792179553, "language_loss": 0.77565914, "learning_rate": 3.955770021006627e-06, "loss": 0.7978611, "num_input_tokens_seen": 33960685, "step": 1581, "time_per_iteration": 2.765394449234009 }, { "auxiliary_loss_clip": 0.01144835, "auxiliary_loss_mlp": 0.0106468, "balance_loss_clip": 1.05426359, "balance_loss_mlp": 1.04276967, "epoch": 0.09511498572072749, "flos": 21215342350080.0, "grad_norm": 1.8617167187056045, "language_loss": 0.87230825, "learning_rate": 3.955688530637116e-06, "loss": 0.89440346, "num_input_tokens_seen": 33980015, "step": 1582, "time_per_iteration": 2.691364288330078 }, { "auxiliary_loss_clip": 0.01174295, "auxiliary_loss_mlp": 0.0106431, "balance_loss_clip": 1.05508888, "balance_loss_mlp": 1.04039705, "epoch": 0.09517510897339546, "flos": 14611262572800.0, "grad_norm": 1.8512060219658202, "language_loss": 0.67043924, "learning_rate": 3.955606966107699e-06, "loss": 0.69282532, "num_input_tokens_seen": 33997705, "step": 1583, "time_per_iteration": 2.6693732738494873 }, { "auxiliary_loss_clip": 0.01177751, "auxiliary_loss_mlp": 0.01053743, "balance_loss_clip": 1.0593859, "balance_loss_mlp": 1.03035378, "epoch": 0.09523523222606343, "flos": 27817339138560.0, "grad_norm": 2.144216926782962, "language_loss": 0.70752859, "learning_rate": 3.95552532742147e-06, "loss": 0.7298435, "num_input_tokens_seen": 34017465, "step": 1584, "time_per_iteration": 2.7164390087127686 }, { "auxiliary_loss_clip": 0.01138507, "auxiliary_loss_mlp": 0.0105762, "balance_loss_clip": 1.05243039, "balance_loss_mlp": 1.03584039, "epoch": 0.0952953554787314, "flos": 20706272847360.0, "grad_norm": 1.4654737580846544, "language_loss": 0.8080442, "learning_rate": 3.955443614581525e-06, "loss": 0.83000553, "num_input_tokens_seen": 34038550, "step": 1585, "time_per_iteration": 2.879831314086914 }, { "auxiliary_loss_clip": 0.01159374, "auxiliary_loss_mlp": 0.01057717, "balance_loss_clip": 1.05387473, "balance_loss_mlp": 1.03355336, "epoch": 0.09535547873139937, "flos": 24787627701120.0, "grad_norm": 1.638250735795891, "language_loss": 0.71921158, "learning_rate": 3.955361827590961e-06, "loss": 0.74138248, "num_input_tokens_seen": 34058665, "step": 1586, "time_per_iteration": 2.750436544418335 }, { "auxiliary_loss_clip": 0.01048565, "auxiliary_loss_mlp": 0.01003302, "balance_loss_clip": 1.03115988, "balance_loss_mlp": 0.99901009, "epoch": 0.09541560198406734, "flos": 71912194905600.0, "grad_norm": 0.8099482252624973, "language_loss": 0.55475175, "learning_rate": 3.955279966452883e-06, "loss": 0.57527041, "num_input_tokens_seen": 34109655, "step": 1587, "time_per_iteration": 3.0975699424743652 }, { "auxiliary_loss_clip": 0.01128884, "auxiliary_loss_mlp": 0.0105965, "balance_loss_clip": 1.04768586, "balance_loss_mlp": 1.03661847, "epoch": 0.09547572523673531, "flos": 28982604251520.0, "grad_norm": 1.708481785076906, "language_loss": 0.81062275, "learning_rate": 3.955198031170391e-06, "loss": 0.83250809, "num_input_tokens_seen": 34131115, "step": 1588, "time_per_iteration": 2.7718451023101807 }, { "auxiliary_loss_clip": 0.01131602, "auxiliary_loss_mlp": 0.01056117, "balance_loss_clip": 1.04894614, "balance_loss_mlp": 1.03438473, "epoch": 0.09553584848940327, "flos": 24133910129280.0, "grad_norm": 1.5119879232668088, "language_loss": 0.81481898, "learning_rate": 3.955116021746594e-06, "loss": 0.83669615, "num_input_tokens_seen": 34151925, "step": 1589, "time_per_iteration": 2.782468795776367 }, { "auxiliary_loss_clip": 0.0112194, "auxiliary_loss_mlp": 0.00780573, "balance_loss_clip": 1.0508883, "balance_loss_mlp": 1.00013089, "epoch": 0.09559597174207124, "flos": 42851376789120.0, "grad_norm": 1.525287399882202, "language_loss": 0.64882791, "learning_rate": 3.955033938184601e-06, "loss": 0.667853, "num_input_tokens_seen": 34175395, "step": 1590, "time_per_iteration": 3.0783450603485107 }, { "auxiliary_loss_clip": 0.01143501, "auxiliary_loss_mlp": 0.01058399, "balance_loss_clip": 1.05087948, "balance_loss_mlp": 1.0358206, "epoch": 0.09565609499473922, "flos": 32670845683200.0, "grad_norm": 2.0745314237741916, "language_loss": 0.83290577, "learning_rate": 3.954951780487526e-06, "loss": 0.85492468, "num_input_tokens_seen": 34197760, "step": 1591, "time_per_iteration": 2.8393962383270264 }, { "auxiliary_loss_clip": 0.01163486, "auxiliary_loss_mlp": 0.01065588, "balance_loss_clip": 1.0522387, "balance_loss_mlp": 1.04266405, "epoch": 0.09571621824740718, "flos": 18478410670080.0, "grad_norm": 2.825705290827541, "language_loss": 0.74087322, "learning_rate": 3.9548695486584835e-06, "loss": 0.76316392, "num_input_tokens_seen": 34215330, "step": 1592, "time_per_iteration": 2.6828882694244385 }, { "auxiliary_loss_clip": 0.01169239, "auxiliary_loss_mlp": 0.01055073, "balance_loss_clip": 1.05161428, "balance_loss_mlp": 1.03337741, "epoch": 0.09577634150007515, "flos": 29387497334400.0, "grad_norm": 2.18277080043521, "language_loss": 0.74483889, "learning_rate": 3.954787242700592e-06, "loss": 0.76708198, "num_input_tokens_seen": 34237745, "step": 1593, "time_per_iteration": 2.7193498611450195 }, { "auxiliary_loss_clip": 0.01177343, "auxiliary_loss_mlp": 0.01055096, "balance_loss_clip": 1.05910873, "balance_loss_mlp": 1.03307831, "epoch": 0.09583646475274313, "flos": 22747830157440.0, "grad_norm": 1.887493467708827, "language_loss": 0.69782627, "learning_rate": 3.954704862616971e-06, "loss": 0.72015071, "num_input_tokens_seen": 34256565, "step": 1594, "time_per_iteration": 2.635383367538452 }, { "auxiliary_loss_clip": 0.01173222, "auxiliary_loss_mlp": 0.01051806, "balance_loss_clip": 1.05618978, "balance_loss_mlp": 1.03037214, "epoch": 0.0958965880054111, "flos": 23218367345280.0, "grad_norm": 2.1411006117727682, "language_loss": 0.82780552, "learning_rate": 3.954622408410747e-06, "loss": 0.85005581, "num_input_tokens_seen": 34275970, "step": 1595, "time_per_iteration": 2.7158257961273193 }, { "auxiliary_loss_clip": 0.01153253, "auxiliary_loss_mlp": 0.01054246, "balance_loss_clip": 1.05143809, "balance_loss_mlp": 1.0301652, "epoch": 0.09595671125807906, "flos": 21324438933120.0, "grad_norm": 1.7751890788987925, "language_loss": 0.84513396, "learning_rate": 3.954539880085045e-06, "loss": 0.86720896, "num_input_tokens_seen": 34295490, "step": 1596, "time_per_iteration": 2.710228204727173 }, { "auxiliary_loss_clip": 0.01166586, "auxiliary_loss_mlp": 0.0105804, "balance_loss_clip": 1.05440903, "balance_loss_mlp": 1.03376901, "epoch": 0.09601683451074704, "flos": 39603472185600.0, "grad_norm": 1.8335529067237837, "language_loss": 0.69328064, "learning_rate": 3.9544572776429945e-06, "loss": 0.71552688, "num_input_tokens_seen": 34319990, "step": 1597, "time_per_iteration": 2.802959442138672 }, { "auxiliary_loss_clip": 0.01167235, "auxiliary_loss_mlp": 0.00780978, "balance_loss_clip": 1.0503217, "balance_loss_mlp": 1.00010371, "epoch": 0.096076957763415, "flos": 23732716147200.0, "grad_norm": 2.0491570740921885, "language_loss": 0.7486403, "learning_rate": 3.954374601087729e-06, "loss": 0.76812243, "num_input_tokens_seen": 34339225, "step": 1598, "time_per_iteration": 2.6502270698547363 }, { "auxiliary_loss_clip": 0.01176661, "auxiliary_loss_mlp": 0.01053936, "balance_loss_clip": 1.05745888, "balance_loss_mlp": 1.03009462, "epoch": 0.09613708101608297, "flos": 34678108483200.0, "grad_norm": 1.6831440826618358, "language_loss": 0.68804371, "learning_rate": 3.954291850422382e-06, "loss": 0.71034968, "num_input_tokens_seen": 34361020, "step": 1599, "time_per_iteration": 2.74243426322937 }, { "auxiliary_loss_clip": 0.01157322, "auxiliary_loss_mlp": 0.01059883, "balance_loss_clip": 1.05754852, "balance_loss_mlp": 1.0371263, "epoch": 0.09619720426875093, "flos": 20740028653440.0, "grad_norm": 2.9774251326108367, "language_loss": 0.83950365, "learning_rate": 3.954209025650093e-06, "loss": 0.86167574, "num_input_tokens_seen": 34378630, "step": 1600, "time_per_iteration": 2.702907085418701 }, { "auxiliary_loss_clip": 0.01150263, "auxiliary_loss_mlp": 0.01054168, "balance_loss_clip": 1.05129707, "balance_loss_mlp": 1.03093433, "epoch": 0.09625732752141891, "flos": 13042720488960.0, "grad_norm": 2.287254549480118, "language_loss": 0.80520785, "learning_rate": 3.954126126774001e-06, "loss": 0.82725215, "num_input_tokens_seen": 34397110, "step": 1601, "time_per_iteration": 2.693399429321289 }, { "auxiliary_loss_clip": 0.01181247, "auxiliary_loss_mlp": 0.01054578, "balance_loss_clip": 1.05711937, "balance_loss_mlp": 1.03133249, "epoch": 0.09631745077408688, "flos": 22273629782400.0, "grad_norm": 2.4356926646094954, "language_loss": 0.81959623, "learning_rate": 3.954043153797251e-06, "loss": 0.84195447, "num_input_tokens_seen": 34414165, "step": 1602, "time_per_iteration": 2.639479875564575 }, { "auxiliary_loss_clip": 0.01137855, "auxiliary_loss_mlp": 0.01051495, "balance_loss_clip": 1.05295444, "balance_loss_mlp": 1.02681863, "epoch": 0.09637757402675484, "flos": 24754266944640.0, "grad_norm": 3.099164686790191, "language_loss": 0.62498438, "learning_rate": 3.953960106722989e-06, "loss": 0.64687788, "num_input_tokens_seen": 34434445, "step": 1603, "time_per_iteration": 4.341834306716919 }, { "auxiliary_loss_clip": 0.01189954, "auxiliary_loss_mlp": 0.01054376, "balance_loss_clip": 1.05902839, "balance_loss_mlp": 1.02918696, "epoch": 0.09643769727942282, "flos": 22525758322560.0, "grad_norm": 3.121905357886113, "language_loss": 0.70996022, "learning_rate": 3.953876985554364e-06, "loss": 0.73240346, "num_input_tokens_seen": 34453095, "step": 1604, "time_per_iteration": 2.6520893573760986 }, { "auxiliary_loss_clip": 0.01176446, "auxiliary_loss_mlp": 0.01055314, "balance_loss_clip": 1.0570209, "balance_loss_mlp": 1.03358221, "epoch": 0.09649782053209079, "flos": 30921026636160.0, "grad_norm": 2.082890345500055, "language_loss": 0.7993719, "learning_rate": 3.953793790294527e-06, "loss": 0.82168949, "num_input_tokens_seen": 34473680, "step": 1605, "time_per_iteration": 4.5557661056518555 }, { "auxiliary_loss_clip": 0.01161047, "auxiliary_loss_mlp": 0.01047918, "balance_loss_clip": 1.05455577, "balance_loss_mlp": 1.0245893, "epoch": 0.09655794378475875, "flos": 25337635729920.0, "grad_norm": 1.990204665194141, "language_loss": 0.74550986, "learning_rate": 3.953710520946634e-06, "loss": 0.76759952, "num_input_tokens_seen": 34492610, "step": 1606, "time_per_iteration": 2.7172651290893555 }, { "auxiliary_loss_clip": 0.01172416, "auxiliary_loss_mlp": 0.01046772, "balance_loss_clip": 1.05834222, "balance_loss_mlp": 1.02378857, "epoch": 0.09661806703742673, "flos": 22346061557760.0, "grad_norm": 1.6403710807101601, "language_loss": 0.7571919, "learning_rate": 3.953627177513843e-06, "loss": 0.77938372, "num_input_tokens_seen": 34511855, "step": 1607, "time_per_iteration": 4.302686452865601 }, { "auxiliary_loss_clip": 0.01139491, "auxiliary_loss_mlp": 0.01051546, "balance_loss_clip": 1.04833579, "balance_loss_mlp": 1.0289799, "epoch": 0.0966781902900947, "flos": 17457578144640.0, "grad_norm": 1.975850982703557, "language_loss": 0.86756283, "learning_rate": 3.953543759999312e-06, "loss": 0.88947326, "num_input_tokens_seen": 34528905, "step": 1608, "time_per_iteration": 2.6280455589294434 }, { "auxiliary_loss_clip": 0.01126253, "auxiliary_loss_mlp": 0.01064704, "balance_loss_clip": 1.05433142, "balance_loss_mlp": 1.03940821, "epoch": 0.09673831354276266, "flos": 36903995412480.0, "grad_norm": 2.3082762386200266, "language_loss": 0.71363097, "learning_rate": 3.953460268406207e-06, "loss": 0.73554057, "num_input_tokens_seen": 34548480, "step": 1609, "time_per_iteration": 2.9116146564483643 }, { "auxiliary_loss_clip": 0.01149353, "auxiliary_loss_mlp": 0.01058179, "balance_loss_clip": 1.0546515, "balance_loss_mlp": 1.03606534, "epoch": 0.09679843679543064, "flos": 20701388597760.0, "grad_norm": 1.9988414994799784, "language_loss": 0.84810984, "learning_rate": 3.953376702737693e-06, "loss": 0.87018514, "num_input_tokens_seen": 34565410, "step": 1610, "time_per_iteration": 2.8005051612854004 }, { "auxiliary_loss_clip": 0.01161389, "auxiliary_loss_mlp": 0.01056267, "balance_loss_clip": 1.05790925, "balance_loss_mlp": 1.03228188, "epoch": 0.0968585600480986, "flos": 23514415240320.0, "grad_norm": 2.176236379770122, "language_loss": 0.6696198, "learning_rate": 3.953293062996939e-06, "loss": 0.69179636, "num_input_tokens_seen": 34584840, "step": 1611, "time_per_iteration": 2.731931447982788 }, { "auxiliary_loss_clip": 0.01125259, "auxiliary_loss_mlp": 0.01057116, "balance_loss_clip": 1.04740572, "balance_loss_mlp": 1.03385806, "epoch": 0.09691868330076657, "flos": 20121072468480.0, "grad_norm": 1.6508278294088392, "language_loss": 0.81067657, "learning_rate": 3.953209349187115e-06, "loss": 0.83250034, "num_input_tokens_seen": 34603360, "step": 1612, "time_per_iteration": 2.7998390197753906 }, { "auxiliary_loss_clip": 0.01182404, "auxiliary_loss_mlp": 0.01069551, "balance_loss_clip": 1.06046534, "balance_loss_mlp": 1.04600716, "epoch": 0.09697880655343454, "flos": 16544692967040.0, "grad_norm": 3.304939197664143, "language_loss": 0.80836105, "learning_rate": 3.953125561311398e-06, "loss": 0.83088064, "num_input_tokens_seen": 34620760, "step": 1613, "time_per_iteration": 2.624218702316284 }, { "auxiliary_loss_clip": 0.01148565, "auxiliary_loss_mlp": 0.01054743, "balance_loss_clip": 1.05542159, "balance_loss_mlp": 1.03047192, "epoch": 0.09703892980610251, "flos": 26104184899200.0, "grad_norm": 1.7164386274315457, "language_loss": 0.84289789, "learning_rate": 3.953041699372964e-06, "loss": 0.86493099, "num_input_tokens_seen": 34640695, "step": 1614, "time_per_iteration": 2.744340419769287 }, { "auxiliary_loss_clip": 0.01066618, "auxiliary_loss_mlp": 0.00759744, "balance_loss_clip": 1.02654934, "balance_loss_mlp": 1.00008702, "epoch": 0.09709905305877048, "flos": 60443622000000.0, "grad_norm": 0.7127167896900892, "language_loss": 0.54629624, "learning_rate": 3.952957763374992e-06, "loss": 0.56455994, "num_input_tokens_seen": 34702395, "step": 1615, "time_per_iteration": 3.1547679901123047 }, { "auxiliary_loss_clip": 0.01033143, "auxiliary_loss_mlp": 0.01017555, "balance_loss_clip": 1.02384067, "balance_loss_mlp": 1.01381195, "epoch": 0.09715917631143844, "flos": 57639932893440.0, "grad_norm": 0.7689373847786285, "language_loss": 0.58190405, "learning_rate": 3.952873753320666e-06, "loss": 0.60241103, "num_input_tokens_seen": 34768910, "step": 1616, "time_per_iteration": 3.3940556049346924 }, { "auxiliary_loss_clip": 0.01155533, "auxiliary_loss_mlp": 0.01067983, "balance_loss_clip": 1.05504358, "balance_loss_mlp": 1.04205465, "epoch": 0.09721929956410642, "flos": 20558212986240.0, "grad_norm": 1.8932449927934136, "language_loss": 0.69031835, "learning_rate": 3.952789669213172e-06, "loss": 0.7125535, "num_input_tokens_seen": 34787680, "step": 1617, "time_per_iteration": 2.714629888534546 }, { "auxiliary_loss_clip": 0.01152637, "auxiliary_loss_mlp": 0.01057882, "balance_loss_clip": 1.05386162, "balance_loss_mlp": 1.03127456, "epoch": 0.09727942281677439, "flos": 27344359825920.0, "grad_norm": 1.755493071880773, "language_loss": 0.80910909, "learning_rate": 3.952705511055698e-06, "loss": 0.83121431, "num_input_tokens_seen": 34808330, "step": 1618, "time_per_iteration": 2.8081507682800293 }, { "auxiliary_loss_clip": 0.01168356, "auxiliary_loss_mlp": 0.01058179, "balance_loss_clip": 1.06048679, "balance_loss_mlp": 1.03678131, "epoch": 0.09733954606944235, "flos": 24900028335360.0, "grad_norm": 1.667659488760432, "language_loss": 0.92901695, "learning_rate": 3.952621278851435e-06, "loss": 0.95128226, "num_input_tokens_seen": 34830020, "step": 1619, "time_per_iteration": 2.7752275466918945 }, { "auxiliary_loss_clip": 0.01175515, "auxiliary_loss_mlp": 0.01058252, "balance_loss_clip": 1.05952573, "balance_loss_mlp": 1.03512526, "epoch": 0.09739966932211033, "flos": 31503928544640.0, "grad_norm": 2.1973967195348902, "language_loss": 0.88978708, "learning_rate": 3.9525369726035784e-06, "loss": 0.91212475, "num_input_tokens_seen": 34850330, "step": 1620, "time_per_iteration": 2.771176338195801 }, { "auxiliary_loss_clip": 0.01153763, "auxiliary_loss_mlp": 0.01065329, "balance_loss_clip": 1.05353975, "balance_loss_mlp": 1.0397464, "epoch": 0.0974597925747783, "flos": 23878764846720.0, "grad_norm": 2.154793183838835, "language_loss": 0.77331412, "learning_rate": 3.952452592315324e-06, "loss": 0.79550499, "num_input_tokens_seen": 34871640, "step": 1621, "time_per_iteration": 2.6740832328796387 }, { "auxiliary_loss_clip": 0.01131342, "auxiliary_loss_mlp": 0.01082359, "balance_loss_clip": 1.04798269, "balance_loss_mlp": 1.05640674, "epoch": 0.09751991582744626, "flos": 17019575700480.0, "grad_norm": 1.9420195171733425, "language_loss": 0.77671158, "learning_rate": 3.952368137989871e-06, "loss": 0.79884863, "num_input_tokens_seen": 34888100, "step": 1622, "time_per_iteration": 2.7247347831726074 }, { "auxiliary_loss_clip": 0.01150185, "auxiliary_loss_mlp": 0.01064277, "balance_loss_clip": 1.05335355, "balance_loss_mlp": 1.04025626, "epoch": 0.09758003908011423, "flos": 28402826826240.0, "grad_norm": 1.8603109065807166, "language_loss": 0.85784447, "learning_rate": 3.9522836096304225e-06, "loss": 0.87998909, "num_input_tokens_seen": 34910485, "step": 1623, "time_per_iteration": 2.785388469696045 }, { "auxiliary_loss_clip": 0.0117659, "auxiliary_loss_mlp": 0.01064102, "balance_loss_clip": 1.05769634, "balance_loss_mlp": 1.04043913, "epoch": 0.09764016233278221, "flos": 18144297336960.0, "grad_norm": 2.39630116599036, "language_loss": 0.80534065, "learning_rate": 3.952199007240184e-06, "loss": 0.82774758, "num_input_tokens_seen": 34928615, "step": 1624, "time_per_iteration": 2.6818184852600098 }, { "auxiliary_loss_clip": 0.01176335, "auxiliary_loss_mlp": 0.01056788, "balance_loss_clip": 1.05616927, "balance_loss_mlp": 1.03465128, "epoch": 0.09770028558545017, "flos": 15265842071040.0, "grad_norm": 2.44379144971104, "language_loss": 0.85556966, "learning_rate": 3.952114330822364e-06, "loss": 0.8779009, "num_input_tokens_seen": 34946045, "step": 1625, "time_per_iteration": 2.6594324111938477 }, { "auxiliary_loss_clip": 0.01181411, "auxiliary_loss_mlp": 0.0106682, "balance_loss_clip": 1.06004012, "balance_loss_mlp": 1.04411101, "epoch": 0.09776040883811814, "flos": 23472435219840.0, "grad_norm": 2.058269503362464, "language_loss": 0.85431635, "learning_rate": 3.952029580380172e-06, "loss": 0.87679869, "num_input_tokens_seen": 34962865, "step": 1626, "time_per_iteration": 2.7384841442108154 }, { "auxiliary_loss_clip": 0.01165311, "auxiliary_loss_mlp": 0.007823, "balance_loss_clip": 1.05467701, "balance_loss_mlp": 1.000211, "epoch": 0.09782053209078612, "flos": 24499480798080.0, "grad_norm": 2.0701580273163036, "language_loss": 0.83370024, "learning_rate": 3.9519447559168234e-06, "loss": 0.85317636, "num_input_tokens_seen": 34983505, "step": 1627, "time_per_iteration": 2.8269948959350586 }, { "auxiliary_loss_clip": 0.01168188, "auxiliary_loss_mlp": 0.01065332, "balance_loss_clip": 1.05557203, "balance_loss_mlp": 1.04275417, "epoch": 0.09788065534345408, "flos": 21580158833280.0, "grad_norm": 1.8143281262319713, "language_loss": 0.84674478, "learning_rate": 3.951859857435534e-06, "loss": 0.86907995, "num_input_tokens_seen": 35001825, "step": 1628, "time_per_iteration": 2.6151821613311768 }, { "auxiliary_loss_clip": 0.01170257, "auxiliary_loss_mlp": 0.01058367, "balance_loss_clip": 1.05374515, "balance_loss_mlp": 1.03558636, "epoch": 0.09794077859612205, "flos": 23842459175040.0, "grad_norm": 1.5658807312485334, "language_loss": 0.75531614, "learning_rate": 3.951774884939523e-06, "loss": 0.77760237, "num_input_tokens_seen": 35023075, "step": 1629, "time_per_iteration": 2.6794557571411133 }, { "auxiliary_loss_clip": 0.01129604, "auxiliary_loss_mlp": 0.01056904, "balance_loss_clip": 1.0577755, "balance_loss_mlp": 1.03169131, "epoch": 0.09800090184879003, "flos": 23659889322240.0, "grad_norm": 1.6755762488260617, "language_loss": 0.78487194, "learning_rate": 3.951689838432013e-06, "loss": 0.80673707, "num_input_tokens_seen": 35043480, "step": 1630, "time_per_iteration": 2.7986228466033936 }, { "auxiliary_loss_clip": 0.01167766, "auxiliary_loss_mlp": 0.01063441, "balance_loss_clip": 1.05938148, "balance_loss_mlp": 1.03804946, "epoch": 0.09806102510145799, "flos": 17055773631360.0, "grad_norm": 1.8175370389297836, "language_loss": 0.86677933, "learning_rate": 3.951604717916228e-06, "loss": 0.88909143, "num_input_tokens_seen": 35061490, "step": 1631, "time_per_iteration": 2.6350157260894775 }, { "auxiliary_loss_clip": 0.01171369, "auxiliary_loss_mlp": 0.01058643, "balance_loss_clip": 1.0610745, "balance_loss_mlp": 1.03625536, "epoch": 0.09812114835412596, "flos": 23878477537920.0, "grad_norm": 2.2030333753544773, "language_loss": 0.82996809, "learning_rate": 3.9515195233953975e-06, "loss": 0.85226822, "num_input_tokens_seen": 35079670, "step": 1632, "time_per_iteration": 2.7990314960479736 }, { "auxiliary_loss_clip": 0.01148453, "auxiliary_loss_mlp": 0.01064004, "balance_loss_clip": 1.05554819, "balance_loss_mlp": 1.04102039, "epoch": 0.09818127160679392, "flos": 20595488325120.0, "grad_norm": 1.531777801288569, "language_loss": 0.7882973, "learning_rate": 3.951434254872751e-06, "loss": 0.81042188, "num_input_tokens_seen": 35099205, "step": 1633, "time_per_iteration": 2.735353708267212 }, { "auxiliary_loss_clip": 0.01170992, "auxiliary_loss_mlp": 0.01061681, "balance_loss_clip": 1.05558002, "balance_loss_mlp": 1.03731489, "epoch": 0.0982413948594619, "flos": 15487339288320.0, "grad_norm": 2.4037572513069687, "language_loss": 0.73209554, "learning_rate": 3.951348912351521e-06, "loss": 0.75442231, "num_input_tokens_seen": 35115270, "step": 1634, "time_per_iteration": 2.688596248626709 }, { "auxiliary_loss_clip": 0.01162743, "auxiliary_loss_mlp": 0.01071164, "balance_loss_clip": 1.05591321, "balance_loss_mlp": 1.04672611, "epoch": 0.09830151811212987, "flos": 24207958016640.0, "grad_norm": 3.2244021303311405, "language_loss": 0.72553629, "learning_rate": 3.951263495834947e-06, "loss": 0.74787533, "num_input_tokens_seen": 35134065, "step": 1635, "time_per_iteration": 2.720266342163086 }, { "auxiliary_loss_clip": 0.01154765, "auxiliary_loss_mlp": 0.01068349, "balance_loss_clip": 1.05526268, "balance_loss_mlp": 1.04177701, "epoch": 0.09836164136479783, "flos": 20594590485120.0, "grad_norm": 1.7699592352066487, "language_loss": 0.78026646, "learning_rate": 3.951178005326264e-06, "loss": 0.80249763, "num_input_tokens_seen": 35154870, "step": 1636, "time_per_iteration": 2.9618239402770996 }, { "auxiliary_loss_clip": 0.01162744, "auxiliary_loss_mlp": 0.01060716, "balance_loss_clip": 1.05561686, "balance_loss_mlp": 1.0368979, "epoch": 0.09842176461746581, "flos": 19934157070080.0, "grad_norm": 1.8332710343018006, "language_loss": 0.69524407, "learning_rate": 3.951092440828715e-06, "loss": 0.71747863, "num_input_tokens_seen": 35171850, "step": 1637, "time_per_iteration": 2.671178102493286 }, { "auxiliary_loss_clip": 0.01188316, "auxiliary_loss_mlp": 0.01058851, "balance_loss_clip": 1.05926394, "balance_loss_mlp": 1.03500926, "epoch": 0.09848188787013377, "flos": 21214659991680.0, "grad_norm": 2.3775286970935503, "language_loss": 0.77050996, "learning_rate": 3.951006802345545e-06, "loss": 0.79298162, "num_input_tokens_seen": 35188795, "step": 1638, "time_per_iteration": 2.62457537651062 }, { "auxiliary_loss_clip": 0.01140265, "auxiliary_loss_mlp": 0.01052026, "balance_loss_clip": 1.05538166, "balance_loss_mlp": 1.02941203, "epoch": 0.09854201112280174, "flos": 30154226071680.0, "grad_norm": 1.4014263071342075, "language_loss": 0.72620296, "learning_rate": 3.950921089880003e-06, "loss": 0.74812591, "num_input_tokens_seen": 35212100, "step": 1639, "time_per_iteration": 2.7499618530273438 }, { "auxiliary_loss_clip": 0.01173752, "auxiliary_loss_mlp": 0.01051382, "balance_loss_clip": 1.0582087, "balance_loss_mlp": 1.02831531, "epoch": 0.09860213437546972, "flos": 21795730306560.0, "grad_norm": 1.7213189449892274, "language_loss": 0.88679075, "learning_rate": 3.950835303435337e-06, "loss": 0.90904212, "num_input_tokens_seen": 35230390, "step": 1640, "time_per_iteration": 2.664133071899414 }, { "auxiliary_loss_clip": 0.01177786, "auxiliary_loss_mlp": 0.01044457, "balance_loss_clip": 1.05981517, "balance_loss_mlp": 1.02130616, "epoch": 0.09866225762813768, "flos": 21835555511040.0, "grad_norm": 2.0701766566296915, "language_loss": 0.80567038, "learning_rate": 3.950749443014801e-06, "loss": 0.82789278, "num_input_tokens_seen": 35250405, "step": 1641, "time_per_iteration": 2.645353317260742 }, { "auxiliary_loss_clip": 0.011756, "auxiliary_loss_mlp": 0.01062641, "balance_loss_clip": 1.05896795, "balance_loss_mlp": 1.03742838, "epoch": 0.09872238088080565, "flos": 17599855916160.0, "grad_norm": 2.64335263522248, "language_loss": 0.86117625, "learning_rate": 3.95066350862165e-06, "loss": 0.88355863, "num_input_tokens_seen": 35262820, "step": 1642, "time_per_iteration": 5.81004524230957 }, { "auxiliary_loss_clip": 0.01151329, "auxiliary_loss_mlp": 0.01056693, "balance_loss_clip": 1.05857074, "balance_loss_mlp": 1.03404331, "epoch": 0.09878250413347361, "flos": 27636134002560.0, "grad_norm": 2.7092208079201607, "language_loss": 0.8058275, "learning_rate": 3.950577500259144e-06, "loss": 0.82790768, "num_input_tokens_seen": 35284490, "step": 1643, "time_per_iteration": 2.7235090732574463 }, { "auxiliary_loss_clip": 0.01174075, "auxiliary_loss_mlp": 0.01077435, "balance_loss_clip": 1.05761337, "balance_loss_mlp": 1.05470192, "epoch": 0.0988426273861416, "flos": 16544728880640.0, "grad_norm": 2.0561742686210676, "language_loss": 0.82546467, "learning_rate": 3.950491417930543e-06, "loss": 0.84797978, "num_input_tokens_seen": 35302815, "step": 1644, "time_per_iteration": 4.318823575973511 }, { "auxiliary_loss_clip": 0.01163142, "auxiliary_loss_mlp": 0.00782463, "balance_loss_clip": 1.05607629, "balance_loss_mlp": 1.00010633, "epoch": 0.09890275063880956, "flos": 21215270522880.0, "grad_norm": 1.6945489721625269, "language_loss": 0.68219113, "learning_rate": 3.9504052616391124e-06, "loss": 0.70164716, "num_input_tokens_seen": 35321175, "step": 1645, "time_per_iteration": 2.6626670360565186 }, { "auxiliary_loss_clip": 0.01059795, "auxiliary_loss_mlp": 0.01047617, "balance_loss_clip": 1.02852345, "balance_loss_mlp": 1.04404068, "epoch": 0.09896287389147752, "flos": 59379372910080.0, "grad_norm": 0.8512889940087613, "language_loss": 0.60885167, "learning_rate": 3.950319031388119e-06, "loss": 0.62992585, "num_input_tokens_seen": 35381740, "step": 1646, "time_per_iteration": 4.752669095993042 }, { "auxiliary_loss_clip": 0.01147006, "auxiliary_loss_mlp": 0.0105976, "balance_loss_clip": 1.0574733, "balance_loss_mlp": 1.03464222, "epoch": 0.0990229971441455, "flos": 29642678530560.0, "grad_norm": 5.785751121573768, "language_loss": 0.73211443, "learning_rate": 3.950232727180833e-06, "loss": 0.7541821, "num_input_tokens_seen": 35403760, "step": 1647, "time_per_iteration": 2.783442974090576 }, { "auxiliary_loss_clip": 0.01161789, "auxiliary_loss_mlp": 0.01066314, "balance_loss_clip": 1.06016421, "balance_loss_mlp": 1.04445136, "epoch": 0.09908312039681347, "flos": 21834873152640.0, "grad_norm": 1.828428298130997, "language_loss": 0.84094375, "learning_rate": 3.950146349020525e-06, "loss": 0.86322474, "num_input_tokens_seen": 35424050, "step": 1648, "time_per_iteration": 2.709559679031372 }, { "auxiliary_loss_clip": 0.01065954, "auxiliary_loss_mlp": 0.01020799, "balance_loss_clip": 1.02565169, "balance_loss_mlp": 1.01722264, "epoch": 0.09914324364948143, "flos": 57564304807680.0, "grad_norm": 0.7317434537206132, "language_loss": 0.55672908, "learning_rate": 3.950059896910473e-06, "loss": 0.5775966, "num_input_tokens_seen": 35481690, "step": 1649, "time_per_iteration": 3.0944156646728516 }, { "auxiliary_loss_clip": 0.0117133, "auxiliary_loss_mlp": 0.01049543, "balance_loss_clip": 1.05603158, "balance_loss_mlp": 1.02723897, "epoch": 0.09920336690214941, "flos": 34123934476800.0, "grad_norm": 2.195431109372502, "language_loss": 0.8975327, "learning_rate": 3.949973370853954e-06, "loss": 0.91974139, "num_input_tokens_seen": 35498635, "step": 1650, "time_per_iteration": 2.7438554763793945 }, { "auxiliary_loss_clip": 0.01033978, "auxiliary_loss_mlp": 0.00758727, "balance_loss_clip": 1.02943921, "balance_loss_mlp": 0.9997822, "epoch": 0.09926349015481738, "flos": 71216428464000.0, "grad_norm": 0.8036050505402587, "language_loss": 0.63734978, "learning_rate": 3.94988677085425e-06, "loss": 0.65527683, "num_input_tokens_seen": 35565720, "step": 1651, "time_per_iteration": 3.40269136428833 }, { "auxiliary_loss_clip": 0.01170347, "auxiliary_loss_mlp": 0.01062486, "balance_loss_clip": 1.05790281, "balance_loss_mlp": 1.03842974, "epoch": 0.09932361340748534, "flos": 23148700917120.0, "grad_norm": 1.9744130417114842, "language_loss": 0.88115525, "learning_rate": 3.949800096914643e-06, "loss": 0.90348363, "num_input_tokens_seen": 35586000, "step": 1652, "time_per_iteration": 2.6695117950439453 }, { "auxiliary_loss_clip": 0.0116773, "auxiliary_loss_mlp": 0.01062073, "balance_loss_clip": 1.06095552, "balance_loss_mlp": 1.03895831, "epoch": 0.09938373666015332, "flos": 19828651847040.0, "grad_norm": 2.166773052437996, "language_loss": 0.81789082, "learning_rate": 3.949713349038422e-06, "loss": 0.84018886, "num_input_tokens_seen": 35604355, "step": 1653, "time_per_iteration": 2.7136831283569336 }, { "auxiliary_loss_clip": 0.01173152, "auxiliary_loss_mlp": 0.00780466, "balance_loss_clip": 1.05683279, "balance_loss_mlp": 1.00016594, "epoch": 0.09944385991282129, "flos": 22090664880000.0, "grad_norm": 1.662037391605293, "language_loss": 0.79489207, "learning_rate": 3.949626527228875e-06, "loss": 0.81442821, "num_input_tokens_seen": 35625495, "step": 1654, "time_per_iteration": 2.645875930786133 }, { "auxiliary_loss_clip": 0.01187918, "auxiliary_loss_mlp": 0.01056849, "balance_loss_clip": 1.06405056, "balance_loss_mlp": 1.03561759, "epoch": 0.09950398316548925, "flos": 19828867328640.0, "grad_norm": 1.7263610037420916, "language_loss": 0.81038272, "learning_rate": 3.949539631489295e-06, "loss": 0.83283037, "num_input_tokens_seen": 35645030, "step": 1655, "time_per_iteration": 2.630404233932495 }, { "auxiliary_loss_clip": 0.01181205, "auxiliary_loss_mlp": 0.01055977, "balance_loss_clip": 1.05679035, "balance_loss_mlp": 1.03294599, "epoch": 0.09956410641815722, "flos": 25003701964800.0, "grad_norm": 2.426795421082641, "language_loss": 0.80429518, "learning_rate": 3.9494526618229765e-06, "loss": 0.82666701, "num_input_tokens_seen": 35664305, "step": 1656, "time_per_iteration": 2.6283950805664062 }, { "auxiliary_loss_clip": 0.01170003, "auxiliary_loss_mlp": 0.01061881, "balance_loss_clip": 1.05787742, "balance_loss_mlp": 1.03870714, "epoch": 0.0996242296708252, "flos": 19317714837120.0, "grad_norm": 1.4960238412267362, "language_loss": 0.89040691, "learning_rate": 3.949365618233217e-06, "loss": 0.91272575, "num_input_tokens_seen": 35684060, "step": 1657, "time_per_iteration": 2.653674602508545 }, { "auxiliary_loss_clip": 0.01165842, "auxiliary_loss_mlp": 0.01057352, "balance_loss_clip": 1.05830753, "balance_loss_mlp": 1.0329144, "epoch": 0.09968435292349316, "flos": 21871609787520.0, "grad_norm": 2.1866084372248062, "language_loss": 0.84684521, "learning_rate": 3.9492785007233195e-06, "loss": 0.86907715, "num_input_tokens_seen": 35703250, "step": 1658, "time_per_iteration": 2.6897473335266113 }, { "auxiliary_loss_clip": 0.01069806, "auxiliary_loss_mlp": 0.01015844, "balance_loss_clip": 1.02042234, "balance_loss_mlp": 1.01292348, "epoch": 0.09974447617616113, "flos": 65384533313280.0, "grad_norm": 0.9123227767672076, "language_loss": 0.60828507, "learning_rate": 3.949191309296585e-06, "loss": 0.62914157, "num_input_tokens_seen": 35762165, "step": 1659, "time_per_iteration": 3.273890495300293 }, { "auxiliary_loss_clip": 0.01152432, "auxiliary_loss_mlp": 0.01051829, "balance_loss_clip": 1.05082798, "balance_loss_mlp": 1.02814245, "epoch": 0.0998045994288291, "flos": 23659817495040.0, "grad_norm": 1.9344290476513741, "language_loss": 0.84892076, "learning_rate": 3.949104043956321e-06, "loss": 0.87096334, "num_input_tokens_seen": 35781520, "step": 1660, "time_per_iteration": 2.788018226623535 }, { "auxiliary_loss_clip": 0.01149163, "auxiliary_loss_mlp": 0.01060092, "balance_loss_clip": 1.05374026, "balance_loss_mlp": 1.03514171, "epoch": 0.09986472268149707, "flos": 19609704495360.0, "grad_norm": 1.9493882663610318, "language_loss": 0.80024737, "learning_rate": 3.949016704705836e-06, "loss": 0.82234001, "num_input_tokens_seen": 35799565, "step": 1661, "time_per_iteration": 2.6537399291992188 }, { "auxiliary_loss_clip": 0.01172787, "auxiliary_loss_mlp": 0.01055532, "balance_loss_clip": 1.05715156, "balance_loss_mlp": 1.03153503, "epoch": 0.09992484593416504, "flos": 26213317395840.0, "grad_norm": 2.0152235709188377, "language_loss": 0.83560598, "learning_rate": 3.948929291548443e-06, "loss": 0.85788912, "num_input_tokens_seen": 35821085, "step": 1662, "time_per_iteration": 2.753807783126831 }, { "auxiliary_loss_clip": 0.01154838, "auxiliary_loss_mlp": 0.01061466, "balance_loss_clip": 1.05079484, "balance_loss_mlp": 1.03616929, "epoch": 0.09998496918683301, "flos": 17493632421120.0, "grad_norm": 1.9355779644050557, "language_loss": 0.88865256, "learning_rate": 3.9488418044874546e-06, "loss": 0.91081554, "num_input_tokens_seen": 35839840, "step": 1663, "time_per_iteration": 2.6829047203063965 }, { "auxiliary_loss_clip": 0.0118246, "auxiliary_loss_mlp": 0.01061692, "balance_loss_clip": 1.06228638, "balance_loss_mlp": 1.03825521, "epoch": 0.10004509243950098, "flos": 22784925928320.0, "grad_norm": 1.7925330820671084, "language_loss": 0.70140731, "learning_rate": 3.948754243526191e-06, "loss": 0.72384882, "num_input_tokens_seen": 35861545, "step": 1664, "time_per_iteration": 2.809300184249878 }, { "auxiliary_loss_clip": 0.01142878, "auxiliary_loss_mlp": 0.01055306, "balance_loss_clip": 1.05475903, "balance_loss_mlp": 1.03312087, "epoch": 0.10010521569216894, "flos": 16253385667200.0, "grad_norm": 2.4978474602303895, "language_loss": 0.78981555, "learning_rate": 3.94866660866797e-06, "loss": 0.81179744, "num_input_tokens_seen": 35878295, "step": 1665, "time_per_iteration": 2.7010488510131836 }, { "auxiliary_loss_clip": 0.01175861, "auxiliary_loss_mlp": 0.01070341, "balance_loss_clip": 1.06286561, "balance_loss_mlp": 1.04742861, "epoch": 0.10016533894483691, "flos": 23402589223680.0, "grad_norm": 3.1438625724360945, "language_loss": 0.70054829, "learning_rate": 3.9485788999161165e-06, "loss": 0.7230103, "num_input_tokens_seen": 35898990, "step": 1666, "time_per_iteration": 2.689879894256592 }, { "auxiliary_loss_clip": 0.01110848, "auxiliary_loss_mlp": 0.01074593, "balance_loss_clip": 1.05082703, "balance_loss_mlp": 1.04946339, "epoch": 0.10022546219750489, "flos": 19354164163200.0, "grad_norm": 1.7583449522195267, "language_loss": 0.78647351, "learning_rate": 3.948491117273956e-06, "loss": 0.80832791, "num_input_tokens_seen": 35916225, "step": 1667, "time_per_iteration": 2.8973352909088135 }, { "auxiliary_loss_clip": 0.01153352, "auxiliary_loss_mlp": 0.01062819, "balance_loss_clip": 1.05452693, "balance_loss_mlp": 1.03752255, "epoch": 0.10028558545017285, "flos": 27085766837760.0, "grad_norm": 2.4011089045072187, "language_loss": 0.77357388, "learning_rate": 3.948403260744817e-06, "loss": 0.7957356, "num_input_tokens_seen": 35934630, "step": 1668, "time_per_iteration": 3.2600321769714355 }, { "auxiliary_loss_clip": 0.01184879, "auxiliary_loss_mlp": 0.01059367, "balance_loss_clip": 1.05833495, "balance_loss_mlp": 1.03523922, "epoch": 0.10034570870284082, "flos": 25847136195840.0, "grad_norm": 1.7407668002390366, "language_loss": 0.77520061, "learning_rate": 3.948315330332031e-06, "loss": 0.79764307, "num_input_tokens_seen": 35953855, "step": 1669, "time_per_iteration": 2.6899471282958984 }, { "auxiliary_loss_clip": 0.0118887, "auxiliary_loss_mlp": 0.01067842, "balance_loss_clip": 1.05948365, "balance_loss_mlp": 1.04416728, "epoch": 0.1004058319555088, "flos": 26249587153920.0, "grad_norm": 5.441134829238958, "language_loss": 0.85160148, "learning_rate": 3.948227326038933e-06, "loss": 0.87416857, "num_input_tokens_seen": 35974555, "step": 1670, "time_per_iteration": 2.616867780685425 }, { "auxiliary_loss_clip": 0.011763, "auxiliary_loss_mlp": 0.01055607, "balance_loss_clip": 1.05584121, "balance_loss_mlp": 1.03354108, "epoch": 0.10046595520817676, "flos": 25374480105600.0, "grad_norm": 1.4849262119454174, "language_loss": 0.76836258, "learning_rate": 3.9481392478688586e-06, "loss": 0.79068166, "num_input_tokens_seen": 35996830, "step": 1671, "time_per_iteration": 2.658254384994507 }, { "auxiliary_loss_clip": 0.01061447, "auxiliary_loss_mlp": 0.01017561, "balance_loss_clip": 1.02178144, "balance_loss_mlp": 1.01454473, "epoch": 0.10052607846084473, "flos": 67461821677440.0, "grad_norm": 0.7781454358921105, "language_loss": 0.60718858, "learning_rate": 3.948051095825149e-06, "loss": 0.62797856, "num_input_tokens_seen": 36054465, "step": 1672, "time_per_iteration": 3.1269097328186035 }, { "auxiliary_loss_clip": 0.01143177, "auxiliary_loss_mlp": 0.01063346, "balance_loss_clip": 1.05112922, "balance_loss_mlp": 1.04055333, "epoch": 0.10058620171351271, "flos": 21360493209600.0, "grad_norm": 2.433278134910662, "language_loss": 0.7711426, "learning_rate": 3.947962869911147e-06, "loss": 0.79320776, "num_input_tokens_seen": 36073480, "step": 1673, "time_per_iteration": 2.6931638717651367 }, { "auxiliary_loss_clip": 0.01132094, "auxiliary_loss_mlp": 0.01056611, "balance_loss_clip": 1.04989302, "balance_loss_mlp": 1.03262639, "epoch": 0.10064632496618067, "flos": 16800125558400.0, "grad_norm": 2.074683072839241, "language_loss": 0.73173523, "learning_rate": 3.947874570130197e-06, "loss": 0.75362229, "num_input_tokens_seen": 36091830, "step": 1674, "time_per_iteration": 2.7188127040863037 }, { "auxiliary_loss_clip": 0.01172389, "auxiliary_loss_mlp": 0.00779533, "balance_loss_clip": 1.0556165, "balance_loss_mlp": 1.00024796, "epoch": 0.10070644821884864, "flos": 23624445576960.0, "grad_norm": 2.1982379565146872, "language_loss": 0.79456973, "learning_rate": 3.947786196485649e-06, "loss": 0.81408894, "num_input_tokens_seen": 36111400, "step": 1675, "time_per_iteration": 2.712090253829956 }, { "auxiliary_loss_clip": 0.01182659, "auxiliary_loss_mlp": 0.01063327, "balance_loss_clip": 1.05801332, "balance_loss_mlp": 1.04239404, "epoch": 0.1007665714715166, "flos": 24462564595200.0, "grad_norm": 2.408955682155161, "language_loss": 0.8120935, "learning_rate": 3.947697748980853e-06, "loss": 0.83455336, "num_input_tokens_seen": 36129345, "step": 1676, "time_per_iteration": 2.685472249984741 }, { "auxiliary_loss_clip": 0.01175397, "auxiliary_loss_mlp": 0.01057105, "balance_loss_clip": 1.05950332, "balance_loss_mlp": 1.03546858, "epoch": 0.10082669472418458, "flos": 16799119977600.0, "grad_norm": 2.008035557658629, "language_loss": 0.86132157, "learning_rate": 3.947609227619163e-06, "loss": 0.88364655, "num_input_tokens_seen": 36146255, "step": 1677, "time_per_iteration": 2.6589157581329346 }, { "auxiliary_loss_clip": 0.01162997, "auxiliary_loss_mlp": 0.010508, "balance_loss_clip": 1.05363441, "balance_loss_mlp": 1.02896047, "epoch": 0.10088681797685255, "flos": 13553513844480.0, "grad_norm": 2.160847391025828, "language_loss": 0.86006588, "learning_rate": 3.947520632403936e-06, "loss": 0.88220382, "num_input_tokens_seen": 36164050, "step": 1678, "time_per_iteration": 2.694347858428955 }, { "auxiliary_loss_clip": 0.0116292, "auxiliary_loss_mlp": 0.01056376, "balance_loss_clip": 1.0587275, "balance_loss_mlp": 1.03406048, "epoch": 0.10094694122952051, "flos": 25265706744960.0, "grad_norm": 12.700254532531051, "language_loss": 0.89978886, "learning_rate": 3.947431963338532e-06, "loss": 0.92198181, "num_input_tokens_seen": 36183530, "step": 1679, "time_per_iteration": 2.6741397380828857 }, { "auxiliary_loss_clip": 0.01071086, "auxiliary_loss_mlp": 0.0101685, "balance_loss_clip": 1.02328789, "balance_loss_mlp": 1.01360798, "epoch": 0.10100706448218849, "flos": 69854299885440.0, "grad_norm": 0.7882499243548835, "language_loss": 0.52985126, "learning_rate": 3.947343220426312e-06, "loss": 0.55073065, "num_input_tokens_seen": 36248550, "step": 1680, "time_per_iteration": 3.169893503189087 }, { "auxiliary_loss_clip": 0.01185252, "auxiliary_loss_mlp": 0.00779951, "balance_loss_clip": 1.06022644, "balance_loss_mlp": 1.00017488, "epoch": 0.10106718773485646, "flos": 20007163463040.0, "grad_norm": 1.6642182724084642, "language_loss": 0.76869059, "learning_rate": 3.947254403670641e-06, "loss": 0.7883426, "num_input_tokens_seen": 36266065, "step": 1681, "time_per_iteration": 4.146950006484985 }, { "auxiliary_loss_clip": 0.01156046, "auxiliary_loss_mlp": 0.01059972, "balance_loss_clip": 1.0539515, "balance_loss_mlp": 1.03469992, "epoch": 0.10112731098752442, "flos": 13479825093120.0, "grad_norm": 2.3884003317971225, "language_loss": 0.93957508, "learning_rate": 3.947165513074889e-06, "loss": 0.96173531, "num_input_tokens_seen": 36280960, "step": 1682, "time_per_iteration": 4.220505237579346 }, { "auxiliary_loss_clip": 0.01173183, "auxiliary_loss_mlp": 0.01053261, "balance_loss_clip": 1.05487084, "balance_loss_mlp": 1.03133821, "epoch": 0.1011874342401924, "flos": 18515901490560.0, "grad_norm": 3.5300660189263917, "language_loss": 0.87618893, "learning_rate": 3.947076548642425e-06, "loss": 0.89845335, "num_input_tokens_seen": 36299010, "step": 1683, "time_per_iteration": 2.635636329650879 }, { "auxiliary_loss_clip": 0.01128888, "auxiliary_loss_mlp": 0.01063089, "balance_loss_clip": 1.04814756, "balance_loss_mlp": 1.04008126, "epoch": 0.10124755749286037, "flos": 20702861055360.0, "grad_norm": 2.3337760241024923, "language_loss": 0.74566805, "learning_rate": 3.946987510376624e-06, "loss": 0.76758784, "num_input_tokens_seen": 36318400, "step": 1684, "time_per_iteration": 4.417364835739136 }, { "auxiliary_loss_clip": 0.01053031, "auxiliary_loss_mlp": 0.0101182, "balance_loss_clip": 1.02547038, "balance_loss_mlp": 1.00853014, "epoch": 0.10130768074552833, "flos": 56109456247680.0, "grad_norm": 0.7564631726021327, "language_loss": 0.61085057, "learning_rate": 3.9468983982808615e-06, "loss": 0.63149905, "num_input_tokens_seen": 36381815, "step": 1685, "time_per_iteration": 4.87179970741272 }, { "auxiliary_loss_clip": 0.01157045, "auxiliary_loss_mlp": 0.01056064, "balance_loss_clip": 1.05233479, "balance_loss_mlp": 1.0341655, "epoch": 0.1013678039981963, "flos": 33402346156800.0, "grad_norm": 4.297801792672815, "language_loss": 0.61381406, "learning_rate": 3.946809212358516e-06, "loss": 0.6359452, "num_input_tokens_seen": 36404320, "step": 1686, "time_per_iteration": 2.8289108276367188 }, { "auxiliary_loss_clip": 0.01144631, "auxiliary_loss_mlp": 0.01059888, "balance_loss_clip": 1.05645001, "balance_loss_mlp": 1.03678524, "epoch": 0.10142792725086427, "flos": 31905338008320.0, "grad_norm": 2.21923850158845, "language_loss": 0.81216162, "learning_rate": 3.946719952612972e-06, "loss": 0.83420682, "num_input_tokens_seen": 36427510, "step": 1687, "time_per_iteration": 2.947535276412964 }, { "auxiliary_loss_clip": 0.0117612, "auxiliary_loss_mlp": 0.0105614, "balance_loss_clip": 1.05933213, "balance_loss_mlp": 1.03403926, "epoch": 0.10148805050353224, "flos": 28475905046400.0, "grad_norm": 1.7955898786084035, "language_loss": 0.71943259, "learning_rate": 3.94663061904761e-06, "loss": 0.74175525, "num_input_tokens_seen": 36448230, "step": 1688, "time_per_iteration": 2.693249225616455 }, { "auxiliary_loss_clip": 0.01151953, "auxiliary_loss_mlp": 0.01063362, "balance_loss_clip": 1.05288756, "balance_loss_mlp": 1.04079556, "epoch": 0.1015481737562002, "flos": 25148888737920.0, "grad_norm": 2.636795901714516, "language_loss": 0.86876953, "learning_rate": 3.94654121166582e-06, "loss": 0.89092261, "num_input_tokens_seen": 36464395, "step": 1689, "time_per_iteration": 2.677992820739746 }, { "auxiliary_loss_clip": 0.01172188, "auxiliary_loss_mlp": 0.01057982, "balance_loss_clip": 1.05476904, "balance_loss_mlp": 1.0378834, "epoch": 0.10160829700886818, "flos": 30882781630080.0, "grad_norm": 2.2211105929909696, "language_loss": 0.88170946, "learning_rate": 3.946451730470993e-06, "loss": 0.90401113, "num_input_tokens_seen": 36486475, "step": 1690, "time_per_iteration": 2.707209348678589 }, { "auxiliary_loss_clip": 0.01158767, "auxiliary_loss_mlp": 0.01052386, "balance_loss_clip": 1.05507553, "balance_loss_mlp": 1.02973664, "epoch": 0.10166842026153615, "flos": 20412020632320.0, "grad_norm": 2.08291471600754, "language_loss": 0.83348423, "learning_rate": 3.946362175466521e-06, "loss": 0.85559577, "num_input_tokens_seen": 36505310, "step": 1691, "time_per_iteration": 2.6521170139312744 }, { "auxiliary_loss_clip": 0.01162159, "auxiliary_loss_mlp": 0.01051716, "balance_loss_clip": 1.05550599, "balance_loss_mlp": 1.03016281, "epoch": 0.10172854351420411, "flos": 33476968661760.0, "grad_norm": 1.704519528530946, "language_loss": 0.66773653, "learning_rate": 3.946272546655801e-06, "loss": 0.68987525, "num_input_tokens_seen": 36529820, "step": 1692, "time_per_iteration": 2.799353837966919 }, { "auxiliary_loss_clip": 0.01144502, "auxiliary_loss_mlp": 0.0107473, "balance_loss_clip": 1.05057836, "balance_loss_mlp": 1.05258095, "epoch": 0.1017886667668721, "flos": 23550325862400.0, "grad_norm": 1.8345924563029705, "language_loss": 0.75939322, "learning_rate": 3.94618284404223e-06, "loss": 0.78158557, "num_input_tokens_seen": 36549000, "step": 1693, "time_per_iteration": 2.6711113452911377 }, { "auxiliary_loss_clip": 0.01132621, "auxiliary_loss_mlp": 0.01057162, "balance_loss_clip": 1.04893303, "balance_loss_mlp": 1.03289056, "epoch": 0.10184879001954006, "flos": 23296078419840.0, "grad_norm": 1.7027745569702395, "language_loss": 0.87503564, "learning_rate": 3.9460930676292105e-06, "loss": 0.89693356, "num_input_tokens_seen": 36567515, "step": 1694, "time_per_iteration": 2.749119520187378 }, { "auxiliary_loss_clip": 0.01130673, "auxiliary_loss_mlp": 0.01058451, "balance_loss_clip": 1.04954553, "balance_loss_mlp": 1.033095, "epoch": 0.10190891327220802, "flos": 18333116156160.0, "grad_norm": 1.7649462193878245, "language_loss": 0.79299057, "learning_rate": 3.946003217420147e-06, "loss": 0.8148818, "num_input_tokens_seen": 36586190, "step": 1695, "time_per_iteration": 2.839081048965454 }, { "auxiliary_loss_clip": 0.0112732, "auxiliary_loss_mlp": 0.01061103, "balance_loss_clip": 1.04818296, "balance_loss_mlp": 1.03772628, "epoch": 0.10196903652487599, "flos": 26465374108800.0, "grad_norm": 2.7190993931598446, "language_loss": 0.86494684, "learning_rate": 3.945913293418447e-06, "loss": 0.88683105, "num_input_tokens_seen": 36607495, "step": 1696, "time_per_iteration": 2.7802348136901855 }, { "auxiliary_loss_clip": 0.01168675, "auxiliary_loss_mlp": 0.01054661, "balance_loss_clip": 1.05711746, "balance_loss_mlp": 1.03315568, "epoch": 0.10202915977754397, "flos": 21869526798720.0, "grad_norm": 1.7889048836535288, "language_loss": 0.82350796, "learning_rate": 3.945823295627519e-06, "loss": 0.84574133, "num_input_tokens_seen": 36628555, "step": 1697, "time_per_iteration": 2.667962074279785 }, { "auxiliary_loss_clip": 0.01184333, "auxiliary_loss_mlp": 0.01055548, "balance_loss_clip": 1.05680871, "balance_loss_mlp": 1.033149, "epoch": 0.10208928303021193, "flos": 22309755886080.0, "grad_norm": 2.0464291543972006, "language_loss": 0.81198204, "learning_rate": 3.9457332240507775e-06, "loss": 0.83438087, "num_input_tokens_seen": 36646250, "step": 1698, "time_per_iteration": 2.6484432220458984 }, { "auxiliary_loss_clip": 0.01150498, "auxiliary_loss_mlp": 0.01053546, "balance_loss_clip": 1.05696845, "balance_loss_mlp": 1.03226686, "epoch": 0.1021494062828799, "flos": 22125569921280.0, "grad_norm": 2.3020250981163226, "language_loss": 0.75612724, "learning_rate": 3.945643078691637e-06, "loss": 0.77816761, "num_input_tokens_seen": 36666675, "step": 1699, "time_per_iteration": 2.8040614128112793 }, { "auxiliary_loss_clip": 0.01162088, "auxiliary_loss_mlp": 0.01050379, "balance_loss_clip": 1.06041551, "balance_loss_mlp": 1.02827764, "epoch": 0.10220952953554788, "flos": 19646728439040.0, "grad_norm": 1.6839869206777538, "language_loss": 0.80395639, "learning_rate": 3.945552859553516e-06, "loss": 0.8260811, "num_input_tokens_seen": 36685225, "step": 1700, "time_per_iteration": 2.6701290607452393 }, { "auxiliary_loss_clip": 0.0117076, "auxiliary_loss_mlp": 0.0104804, "balance_loss_clip": 1.05714083, "balance_loss_mlp": 1.02653444, "epoch": 0.10226965278821584, "flos": 29787290686080.0, "grad_norm": 2.102621975458346, "language_loss": 0.76877582, "learning_rate": 3.945462566639836e-06, "loss": 0.79096377, "num_input_tokens_seen": 36705985, "step": 1701, "time_per_iteration": 2.748201847076416 }, { "auxiliary_loss_clip": 0.01182259, "auxiliary_loss_mlp": 0.01050364, "balance_loss_clip": 1.06157088, "balance_loss_mlp": 1.02852523, "epoch": 0.10232977604088381, "flos": 27016818681600.0, "grad_norm": 2.1099726588763965, "language_loss": 0.77922845, "learning_rate": 3.945372199954019e-06, "loss": 0.80155474, "num_input_tokens_seen": 36725815, "step": 1702, "time_per_iteration": 2.6703274250030518 }, { "auxiliary_loss_clip": 0.01156323, "auxiliary_loss_mlp": 0.01052524, "balance_loss_clip": 1.05596721, "balance_loss_mlp": 1.03126872, "epoch": 0.10238989929355179, "flos": 20777519473920.0, "grad_norm": 2.2326457826946293, "language_loss": 0.94093609, "learning_rate": 3.945281759499494e-06, "loss": 0.96302462, "num_input_tokens_seen": 36742345, "step": 1703, "time_per_iteration": 2.6712698936462402 }, { "auxiliary_loss_clip": 0.01034483, "auxiliary_loss_mlp": 0.01037784, "balance_loss_clip": 1.02765131, "balance_loss_mlp": 1.03315914, "epoch": 0.10245002254621975, "flos": 57698322451200.0, "grad_norm": 0.8815387598011586, "language_loss": 0.55096036, "learning_rate": 3.94519124527969e-06, "loss": 0.57168299, "num_input_tokens_seen": 36798775, "step": 1704, "time_per_iteration": 3.2863855361938477 }, { "auxiliary_loss_clip": 0.01186822, "auxiliary_loss_mlp": 0.01053701, "balance_loss_clip": 1.06026638, "balance_loss_mlp": 1.03088403, "epoch": 0.10251014579888772, "flos": 16800125558400.0, "grad_norm": 2.051901555713709, "language_loss": 0.84025991, "learning_rate": 3.945100657298039e-06, "loss": 0.86266518, "num_input_tokens_seen": 36816295, "step": 1705, "time_per_iteration": 2.8991851806640625 }, { "auxiliary_loss_clip": 0.01045354, "auxiliary_loss_mlp": 0.01018361, "balance_loss_clip": 1.02622223, "balance_loss_mlp": 1.01526153, "epoch": 0.1025702690515557, "flos": 68565500922240.0, "grad_norm": 0.7692746082941451, "language_loss": 0.60408181, "learning_rate": 3.9450099955579765e-06, "loss": 0.62471896, "num_input_tokens_seen": 36882030, "step": 1706, "time_per_iteration": 3.2174558639526367 }, { "auxiliary_loss_clip": 0.01149922, "auxiliary_loss_mlp": 0.01051211, "balance_loss_clip": 1.05388391, "balance_loss_mlp": 1.02812052, "epoch": 0.10263039230422366, "flos": 14866623336960.0, "grad_norm": 2.201796189576969, "language_loss": 0.85937822, "learning_rate": 3.94491926006294e-06, "loss": 0.88138962, "num_input_tokens_seen": 36899245, "step": 1707, "time_per_iteration": 2.689208507537842 }, { "auxiliary_loss_clip": 0.01169165, "auxiliary_loss_mlp": 0.0105297, "balance_loss_clip": 1.05941081, "balance_loss_mlp": 1.03114319, "epoch": 0.10269051555689163, "flos": 25337599816320.0, "grad_norm": 1.471109036018689, "language_loss": 0.73299325, "learning_rate": 3.944828450816369e-06, "loss": 0.75521457, "num_input_tokens_seen": 36920950, "step": 1708, "time_per_iteration": 2.679760456085205 }, { "auxiliary_loss_clip": 0.01155833, "auxiliary_loss_mlp": 0.00780571, "balance_loss_clip": 1.05718231, "balance_loss_mlp": 1.00042295, "epoch": 0.10275063880955959, "flos": 21068826773760.0, "grad_norm": 1.7051644476897239, "language_loss": 0.91616452, "learning_rate": 3.944737567821709e-06, "loss": 0.93552846, "num_input_tokens_seen": 36938900, "step": 1709, "time_per_iteration": 2.6754679679870605 }, { "auxiliary_loss_clip": 0.01124911, "auxiliary_loss_mlp": 0.01057008, "balance_loss_clip": 1.05144072, "balance_loss_mlp": 1.0343945, "epoch": 0.10281076206222757, "flos": 30366780802560.0, "grad_norm": 2.1056252966717275, "language_loss": 0.88004494, "learning_rate": 3.944646611082406e-06, "loss": 0.90186411, "num_input_tokens_seen": 36957010, "step": 1710, "time_per_iteration": 2.708723306655884 }, { "auxiliary_loss_clip": 0.01171004, "auxiliary_loss_mlp": 0.0105967, "balance_loss_clip": 1.05658317, "balance_loss_mlp": 1.036973, "epoch": 0.10287088531489554, "flos": 22418313765120.0, "grad_norm": 1.7046493271202992, "language_loss": 0.79370153, "learning_rate": 3.944555580601908e-06, "loss": 0.81600821, "num_input_tokens_seen": 36977690, "step": 1711, "time_per_iteration": 2.631908416748047 }, { "auxiliary_loss_clip": 0.01156003, "auxiliary_loss_mlp": 0.01055126, "balance_loss_clip": 1.05841637, "balance_loss_mlp": 1.03189242, "epoch": 0.1029310085675635, "flos": 25115994858240.0, "grad_norm": 3.2168061349371135, "language_loss": 0.73666596, "learning_rate": 3.944464476383668e-06, "loss": 0.75877726, "num_input_tokens_seen": 36997300, "step": 1712, "time_per_iteration": 2.7107467651367188 }, { "auxiliary_loss_clip": 0.01133407, "auxiliary_loss_mlp": 0.01056055, "balance_loss_clip": 1.05496907, "balance_loss_mlp": 1.03334546, "epoch": 0.10299113182023148, "flos": 19865639877120.0, "grad_norm": 1.974447377126898, "language_loss": 0.87049067, "learning_rate": 3.94437329843114e-06, "loss": 0.89238536, "num_input_tokens_seen": 37016110, "step": 1713, "time_per_iteration": 2.6532411575317383 }, { "auxiliary_loss_clip": 0.0116832, "auxiliary_loss_mlp": 0.01060237, "balance_loss_clip": 1.05669498, "balance_loss_mlp": 1.03877962, "epoch": 0.10305125507289944, "flos": 20447608032000.0, "grad_norm": 1.57388574383124, "language_loss": 0.72406238, "learning_rate": 3.944282046747782e-06, "loss": 0.74634796, "num_input_tokens_seen": 37036405, "step": 1714, "time_per_iteration": 2.5987610816955566 }, { "auxiliary_loss_clip": 0.01174482, "auxiliary_loss_mlp": 0.01063165, "balance_loss_clip": 1.05715692, "balance_loss_mlp": 1.03934693, "epoch": 0.10311137832556741, "flos": 26250772302720.0, "grad_norm": 2.1959530175190434, "language_loss": 0.91065919, "learning_rate": 3.944190721337053e-06, "loss": 0.93303567, "num_input_tokens_seen": 37057580, "step": 1715, "time_per_iteration": 2.743833303451538 }, { "auxiliary_loss_clip": 0.01170297, "auxiliary_loss_mlp": 0.01054891, "balance_loss_clip": 1.05448914, "balance_loss_mlp": 1.03305221, "epoch": 0.10317150157823539, "flos": 35298932175360.0, "grad_norm": 1.8741123562687005, "language_loss": 0.75969976, "learning_rate": 3.944099322202418e-06, "loss": 0.78195167, "num_input_tokens_seen": 37079120, "step": 1716, "time_per_iteration": 2.748903274536133 }, { "auxiliary_loss_clip": 0.01162664, "auxiliary_loss_mlp": 0.01061895, "balance_loss_clip": 1.05617428, "balance_loss_mlp": 1.03804111, "epoch": 0.10323162483090335, "flos": 25739943033600.0, "grad_norm": 3.178190042364093, "language_loss": 0.85308528, "learning_rate": 3.944007849347342e-06, "loss": 0.87533092, "num_input_tokens_seen": 37099710, "step": 1717, "time_per_iteration": 2.690772533416748 }, { "auxiliary_loss_clip": 0.01127019, "auxiliary_loss_mlp": 0.01067935, "balance_loss_clip": 1.05048633, "balance_loss_mlp": 1.04436755, "epoch": 0.10329174808357132, "flos": 16289870906880.0, "grad_norm": 1.8474438265561113, "language_loss": 0.82945001, "learning_rate": 3.943916302775292e-06, "loss": 0.85139954, "num_input_tokens_seen": 37117775, "step": 1718, "time_per_iteration": 2.7029476165771484 }, { "auxiliary_loss_clip": 0.01171184, "auxiliary_loss_mlp": 0.01049869, "balance_loss_clip": 1.05912328, "balance_loss_mlp": 1.02701616, "epoch": 0.10335187133623928, "flos": 36687166963200.0, "grad_norm": 1.7728224248964342, "language_loss": 0.73396438, "learning_rate": 3.943824682489742e-06, "loss": 0.75617492, "num_input_tokens_seen": 37140280, "step": 1719, "time_per_iteration": 2.7653820514678955 }, { "auxiliary_loss_clip": 0.01168859, "auxiliary_loss_mlp": 0.01048444, "balance_loss_clip": 1.05861163, "balance_loss_mlp": 1.02786827, "epoch": 0.10341199458890726, "flos": 14975648092800.0, "grad_norm": 1.7819459058763836, "language_loss": 0.92692196, "learning_rate": 3.9437329884941665e-06, "loss": 0.94909501, "num_input_tokens_seen": 37158350, "step": 1720, "time_per_iteration": 4.1962480545043945 }, { "auxiliary_loss_clip": 0.01139894, "auxiliary_loss_mlp": 0.01051033, "balance_loss_clip": 1.05092323, "balance_loss_mlp": 1.02827597, "epoch": 0.10347211784157523, "flos": 21031587348480.0, "grad_norm": 1.6861044154168399, "language_loss": 0.79497123, "learning_rate": 3.943641220792039e-06, "loss": 0.81688046, "num_input_tokens_seen": 37177120, "step": 1721, "time_per_iteration": 4.524151802062988 }, { "auxiliary_loss_clip": 0.01130482, "auxiliary_loss_mlp": 0.01067754, "balance_loss_clip": 1.05380797, "balance_loss_mlp": 1.04109859, "epoch": 0.1035322410942432, "flos": 19792094780160.0, "grad_norm": 1.951940775381607, "language_loss": 0.80707669, "learning_rate": 3.9435493793868434e-06, "loss": 0.829059, "num_input_tokens_seen": 37195895, "step": 1722, "time_per_iteration": 2.7972562313079834 }, { "auxiliary_loss_clip": 0.01059018, "auxiliary_loss_mlp": 0.01038991, "balance_loss_clip": 1.02668202, "balance_loss_mlp": 1.03536737, "epoch": 0.10359236434691117, "flos": 52698874947840.0, "grad_norm": 0.9413879826908518, "language_loss": 0.67161834, "learning_rate": 3.943457464282059e-06, "loss": 0.69259846, "num_input_tokens_seen": 37247270, "step": 1723, "time_per_iteration": 4.899553060531616 }, { "auxiliary_loss_clip": 0.01169875, "auxiliary_loss_mlp": 0.01062977, "balance_loss_clip": 1.05482125, "balance_loss_mlp": 1.04193664, "epoch": 0.10365248759957914, "flos": 18405404277120.0, "grad_norm": 2.8641520576523116, "language_loss": 0.77715755, "learning_rate": 3.9433654754811745e-06, "loss": 0.7994861, "num_input_tokens_seen": 37265595, "step": 1724, "time_per_iteration": 2.7613437175750732 }, { "auxiliary_loss_clip": 0.01151829, "auxiliary_loss_mlp": 0.01069246, "balance_loss_clip": 1.05667496, "balance_loss_mlp": 1.04753852, "epoch": 0.1037126108522471, "flos": 47553555335040.0, "grad_norm": 2.6433978354033543, "language_loss": 0.74533165, "learning_rate": 3.943273412987676e-06, "loss": 0.76754242, "num_input_tokens_seen": 37286660, "step": 1725, "time_per_iteration": 4.557274580001831 }, { "auxiliary_loss_clip": 0.01137065, "auxiliary_loss_mlp": 0.01081067, "balance_loss_clip": 1.05264461, "balance_loss_mlp": 1.05832207, "epoch": 0.10377273410491508, "flos": 22816670572800.0, "grad_norm": 2.2241153649877865, "language_loss": 0.75043738, "learning_rate": 3.943181276805054e-06, "loss": 0.77261865, "num_input_tokens_seen": 37304915, "step": 1726, "time_per_iteration": 2.7098495960235596 }, { "auxiliary_loss_clip": 0.01150932, "auxiliary_loss_mlp": 0.0107864, "balance_loss_clip": 1.05345368, "balance_loss_mlp": 1.05610991, "epoch": 0.10383285735758305, "flos": 26138694890880.0, "grad_norm": 2.783771441956431, "language_loss": 0.73243797, "learning_rate": 3.9430890669368035e-06, "loss": 0.75473368, "num_input_tokens_seen": 37325265, "step": 1727, "time_per_iteration": 2.74774169921875 }, { "auxiliary_loss_clip": 0.01157922, "auxiliary_loss_mlp": 0.01068007, "balance_loss_clip": 1.05303776, "balance_loss_mlp": 1.04625082, "epoch": 0.10389298061025101, "flos": 17091791994240.0, "grad_norm": 2.172978726198527, "language_loss": 0.84373868, "learning_rate": 3.942996783386422e-06, "loss": 0.86599791, "num_input_tokens_seen": 37341650, "step": 1728, "time_per_iteration": 2.675724744796753 }, { "auxiliary_loss_clip": 0.01154897, "auxiliary_loss_mlp": 0.01060505, "balance_loss_clip": 1.0545603, "balance_loss_mlp": 1.0393219, "epoch": 0.10395310386291898, "flos": 20776513893120.0, "grad_norm": 2.1406499008555513, "language_loss": 0.70776087, "learning_rate": 3.942904426157406e-06, "loss": 0.7299149, "num_input_tokens_seen": 37360270, "step": 1729, "time_per_iteration": 2.6885008811950684 }, { "auxiliary_loss_clip": 0.01158623, "auxiliary_loss_mlp": 0.01068311, "balance_loss_clip": 1.05437422, "balance_loss_mlp": 1.04520774, "epoch": 0.10401322711558696, "flos": 12820540913280.0, "grad_norm": 2.4133379049648283, "language_loss": 0.81237471, "learning_rate": 3.9428119952532605e-06, "loss": 0.83464402, "num_input_tokens_seen": 37375225, "step": 1730, "time_per_iteration": 2.6659536361694336 }, { "auxiliary_loss_clip": 0.01085856, "auxiliary_loss_mlp": 0.01063394, "balance_loss_clip": 1.04733562, "balance_loss_mlp": 1.04314065, "epoch": 0.10407335036825492, "flos": 23184683366400.0, "grad_norm": 1.6634499611984725, "language_loss": 0.75829297, "learning_rate": 3.942719490677489e-06, "loss": 0.77978551, "num_input_tokens_seen": 37395165, "step": 1731, "time_per_iteration": 3.043125629425049 }, { "auxiliary_loss_clip": 0.01129913, "auxiliary_loss_mlp": 0.01065783, "balance_loss_clip": 1.0526607, "balance_loss_mlp": 1.04604149, "epoch": 0.10413347362092289, "flos": 26104184899200.0, "grad_norm": 1.8280179918091173, "language_loss": 0.8268069, "learning_rate": 3.9426269124336e-06, "loss": 0.84876388, "num_input_tokens_seen": 37414845, "step": 1732, "time_per_iteration": 2.96221661567688 }, { "auxiliary_loss_clip": 0.01141505, "auxiliary_loss_mlp": 0.01067805, "balance_loss_clip": 1.05805755, "balance_loss_mlp": 1.04852867, "epoch": 0.10419359687359087, "flos": 12641059630080.0, "grad_norm": 1.9919813178368582, "language_loss": 0.83320522, "learning_rate": 3.942534260525104e-06, "loss": 0.85529828, "num_input_tokens_seen": 37432490, "step": 1733, "time_per_iteration": 2.7364420890808105 }, { "auxiliary_loss_clip": 0.01153374, "auxiliary_loss_mlp": 0.0106675, "balance_loss_clip": 1.05592012, "balance_loss_mlp": 1.04654372, "epoch": 0.10425372012625883, "flos": 12125094716160.0, "grad_norm": 2.4441875881355597, "language_loss": 0.76683885, "learning_rate": 3.942441534955514e-06, "loss": 0.78904009, "num_input_tokens_seen": 37449435, "step": 1734, "time_per_iteration": 2.669623851776123 }, { "auxiliary_loss_clip": 0.0113597, "auxiliary_loss_mlp": 0.01052567, "balance_loss_clip": 1.05042601, "balance_loss_mlp": 1.03255177, "epoch": 0.1043138433789268, "flos": 25337563902720.0, "grad_norm": 1.6775801166329647, "language_loss": 0.74826896, "learning_rate": 3.9423487357283465e-06, "loss": 0.7701543, "num_input_tokens_seen": 37469105, "step": 1735, "time_per_iteration": 2.8477160930633545 }, { "auxiliary_loss_clip": 0.01167698, "auxiliary_loss_mlp": 0.01055716, "balance_loss_clip": 1.05678105, "balance_loss_mlp": 1.0344727, "epoch": 0.10437396663159478, "flos": 29167149352320.0, "grad_norm": 1.7228393064183538, "language_loss": 0.78835273, "learning_rate": 3.94225586284712e-06, "loss": 0.81058681, "num_input_tokens_seen": 37490540, "step": 1736, "time_per_iteration": 2.690453052520752 }, { "auxiliary_loss_clip": 0.0116734, "auxiliary_loss_mlp": 0.01064692, "balance_loss_clip": 1.05800533, "balance_loss_mlp": 1.04357982, "epoch": 0.10443408988426274, "flos": 25080946162560.0, "grad_norm": 1.8549131823334455, "language_loss": 0.7058785, "learning_rate": 3.942162916315356e-06, "loss": 0.72819883, "num_input_tokens_seen": 37511905, "step": 1737, "time_per_iteration": 2.6296744346618652 }, { "auxiliary_loss_clip": 0.01150138, "auxiliary_loss_mlp": 0.01059407, "balance_loss_clip": 1.04806042, "balance_loss_mlp": 1.03600669, "epoch": 0.1044942131369307, "flos": 26759662237440.0, "grad_norm": 2.415613377802324, "language_loss": 0.81624997, "learning_rate": 3.942069896136581e-06, "loss": 0.83834541, "num_input_tokens_seen": 37533635, "step": 1738, "time_per_iteration": 2.7436723709106445 }, { "auxiliary_loss_clip": 0.01181471, "auxiliary_loss_mlp": 0.01062035, "balance_loss_clip": 1.05579174, "balance_loss_mlp": 1.03950453, "epoch": 0.10455433638959867, "flos": 18442571875200.0, "grad_norm": 2.1004590024567897, "language_loss": 0.75419426, "learning_rate": 3.9419768023143196e-06, "loss": 0.77662933, "num_input_tokens_seen": 37552035, "step": 1739, "time_per_iteration": 2.585538148880005 }, { "auxiliary_loss_clip": 0.01146716, "auxiliary_loss_mlp": 0.01054893, "balance_loss_clip": 1.05417264, "balance_loss_mlp": 1.03348303, "epoch": 0.10461445964226665, "flos": 23218977876480.0, "grad_norm": 1.586314706443492, "language_loss": 0.77523744, "learning_rate": 3.941883634852104e-06, "loss": 0.79725355, "num_input_tokens_seen": 37571540, "step": 1740, "time_per_iteration": 2.8947789669036865 }, { "auxiliary_loss_clip": 0.01152077, "auxiliary_loss_mlp": 0.01049503, "balance_loss_clip": 1.05725431, "balance_loss_mlp": 1.0288676, "epoch": 0.10467458289493461, "flos": 24345243797760.0, "grad_norm": 1.964868695493703, "language_loss": 0.85976374, "learning_rate": 3.941790393753467e-06, "loss": 0.88177955, "num_input_tokens_seen": 37588265, "step": 1741, "time_per_iteration": 2.7706260681152344 }, { "auxiliary_loss_clip": 0.01158134, "auxiliary_loss_mlp": 0.01056311, "balance_loss_clip": 1.05614483, "balance_loss_mlp": 1.03350592, "epoch": 0.10473470614760258, "flos": 21287953693440.0, "grad_norm": 5.197245251055922, "language_loss": 0.75592613, "learning_rate": 3.941697079021942e-06, "loss": 0.77807057, "num_input_tokens_seen": 37606860, "step": 1742, "time_per_iteration": 2.784748077392578 }, { "auxiliary_loss_clip": 0.0113066, "auxiliary_loss_mlp": 0.01057571, "balance_loss_clip": 1.05678856, "balance_loss_mlp": 1.03735304, "epoch": 0.10479482940027056, "flos": 21687208341120.0, "grad_norm": 2.1426857583950416, "language_loss": 0.87614191, "learning_rate": 3.94160369066107e-06, "loss": 0.89802414, "num_input_tokens_seen": 37625210, "step": 1743, "time_per_iteration": 2.819350004196167 }, { "auxiliary_loss_clip": 0.01139959, "auxiliary_loss_mlp": 0.01048534, "balance_loss_clip": 1.0552268, "balance_loss_mlp": 1.0254786, "epoch": 0.10485495265293852, "flos": 21573694385280.0, "grad_norm": 2.060686178474056, "language_loss": 0.75927812, "learning_rate": 3.941510228674391e-06, "loss": 0.7811631, "num_input_tokens_seen": 37644110, "step": 1744, "time_per_iteration": 2.7817211151123047 }, { "auxiliary_loss_clip": 0.01170232, "auxiliary_loss_mlp": 0.01054483, "balance_loss_clip": 1.05992889, "balance_loss_mlp": 1.03442037, "epoch": 0.10491507590560649, "flos": 37961923708800.0, "grad_norm": 1.9689383181633062, "language_loss": 0.78905094, "learning_rate": 3.941416693065451e-06, "loss": 0.81129813, "num_input_tokens_seen": 37665800, "step": 1745, "time_per_iteration": 2.88080096244812 }, { "auxiliary_loss_clip": 0.01180482, "auxiliary_loss_mlp": 0.01060479, "balance_loss_clip": 1.05740213, "balance_loss_mlp": 1.03920031, "epoch": 0.10497519915827447, "flos": 26396282298240.0, "grad_norm": 2.64819141351011, "language_loss": 0.82568693, "learning_rate": 3.941323083837794e-06, "loss": 0.84809649, "num_input_tokens_seen": 37685095, "step": 1746, "time_per_iteration": 2.7068004608154297 }, { "auxiliary_loss_clip": 0.01158367, "auxiliary_loss_mlp": 0.0105595, "balance_loss_clip": 1.05737162, "balance_loss_mlp": 1.03448033, "epoch": 0.10503532241094243, "flos": 40662190581120.0, "grad_norm": 1.6274602877205533, "language_loss": 0.70573747, "learning_rate": 3.941229400994971e-06, "loss": 0.7278806, "num_input_tokens_seen": 37707445, "step": 1747, "time_per_iteration": 2.8689963817596436 }, { "auxiliary_loss_clip": 0.01159389, "auxiliary_loss_mlp": 0.01056346, "balance_loss_clip": 1.06035507, "balance_loss_mlp": 1.03492367, "epoch": 0.1050954456636104, "flos": 29789409588480.0, "grad_norm": 2.386885173400054, "language_loss": 0.8447504, "learning_rate": 3.941135644540535e-06, "loss": 0.86690772, "num_input_tokens_seen": 37728325, "step": 1748, "time_per_iteration": 2.8022749423980713 }, { "auxiliary_loss_clip": 0.01175489, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.05471563, "balance_loss_mlp": 1.02701974, "epoch": 0.10515556891627838, "flos": 23948754497280.0, "grad_norm": 1.759895679837136, "language_loss": 0.71681082, "learning_rate": 3.941041814478041e-06, "loss": 0.73905981, "num_input_tokens_seen": 37748910, "step": 1749, "time_per_iteration": 2.6568849086761475 }, { "auxiliary_loss_clip": 0.01158221, "auxiliary_loss_mlp": 0.01058697, "balance_loss_clip": 1.05427456, "balance_loss_mlp": 1.03590393, "epoch": 0.10521569216894634, "flos": 18259606972800.0, "grad_norm": 2.95022560634889, "language_loss": 0.81510806, "learning_rate": 3.940947910811047e-06, "loss": 0.83727717, "num_input_tokens_seen": 37765745, "step": 1750, "time_per_iteration": 2.6282739639282227 }, { "auxiliary_loss_clip": 0.01156475, "auxiliary_loss_mlp": 0.01062657, "balance_loss_clip": 1.06022298, "balance_loss_mlp": 1.03973269, "epoch": 0.10527581542161431, "flos": 15630909949440.0, "grad_norm": 2.2218325288878953, "language_loss": 0.92364043, "learning_rate": 3.940853933543114e-06, "loss": 0.94583178, "num_input_tokens_seen": 37780520, "step": 1751, "time_per_iteration": 2.703376531600952 }, { "auxiliary_loss_clip": 0.01165779, "auxiliary_loss_mlp": 0.01053304, "balance_loss_clip": 1.0570029, "balance_loss_mlp": 1.03171563, "epoch": 0.10533593867428227, "flos": 18296559089280.0, "grad_norm": 2.0356912608722877, "language_loss": 0.79293752, "learning_rate": 3.940759882677805e-06, "loss": 0.81512833, "num_input_tokens_seen": 37799515, "step": 1752, "time_per_iteration": 2.6501150131225586 }, { "auxiliary_loss_clip": 0.01116865, "auxiliary_loss_mlp": 0.01055489, "balance_loss_clip": 1.05116987, "balance_loss_mlp": 1.03264856, "epoch": 0.10539606192695025, "flos": 29023219555200.0, "grad_norm": 2.022904639316529, "language_loss": 0.75978744, "learning_rate": 3.940665758218686e-06, "loss": 0.78151095, "num_input_tokens_seen": 37818695, "step": 1753, "time_per_iteration": 2.871335744857788 }, { "auxiliary_loss_clip": 0.01141721, "auxiliary_loss_mlp": 0.01057356, "balance_loss_clip": 1.05547547, "balance_loss_mlp": 1.03415775, "epoch": 0.10545618517961822, "flos": 19969313506560.0, "grad_norm": 2.0563919939847914, "language_loss": 0.83969283, "learning_rate": 3.940571560169328e-06, "loss": 0.86168355, "num_input_tokens_seen": 37837860, "step": 1754, "time_per_iteration": 2.685591459274292 }, { "auxiliary_loss_clip": 0.01136802, "auxiliary_loss_mlp": 0.01053577, "balance_loss_clip": 1.05587101, "balance_loss_mlp": 1.03034329, "epoch": 0.10551630843228618, "flos": 16143427157760.0, "grad_norm": 2.7567281016961087, "language_loss": 0.68732727, "learning_rate": 3.940477288533302e-06, "loss": 0.70923102, "num_input_tokens_seen": 37856260, "step": 1755, "time_per_iteration": 2.754117727279663 }, { "auxiliary_loss_clip": 0.01161626, "auxiliary_loss_mlp": 0.010623, "balance_loss_clip": 1.05367684, "balance_loss_mlp": 1.040187, "epoch": 0.10557643168495416, "flos": 23440115957760.0, "grad_norm": 2.26658946748733, "language_loss": 0.76382339, "learning_rate": 3.940382943314182e-06, "loss": 0.7860626, "num_input_tokens_seen": 37876960, "step": 1756, "time_per_iteration": 2.686790943145752 }, { "auxiliary_loss_clip": 0.01182062, "auxiliary_loss_mlp": 0.01062906, "balance_loss_clip": 1.05688286, "balance_loss_mlp": 1.04203284, "epoch": 0.10563655493762213, "flos": 21799034357760.0, "grad_norm": 1.5917029795724482, "language_loss": 0.79926664, "learning_rate": 3.940288524515547e-06, "loss": 0.82171631, "num_input_tokens_seen": 37897070, "step": 1757, "time_per_iteration": 2.6543681621551514 }, { "auxiliary_loss_clip": 0.01149304, "auxiliary_loss_mlp": 0.01057523, "balance_loss_clip": 1.0524838, "balance_loss_mlp": 1.03563643, "epoch": 0.10569667819029009, "flos": 53800863275520.0, "grad_norm": 1.6583181970862437, "language_loss": 0.78714895, "learning_rate": 3.940194032140976e-06, "loss": 0.80921721, "num_input_tokens_seen": 37923635, "step": 1758, "time_per_iteration": 3.013157367706299 }, { "auxiliary_loss_clip": 0.01165597, "auxiliary_loss_mlp": 0.01054919, "balance_loss_clip": 1.05894113, "balance_loss_mlp": 1.03347349, "epoch": 0.10575680144295807, "flos": 22925515760640.0, "grad_norm": 1.870482409236857, "language_loss": 0.91388202, "learning_rate": 3.940099466194054e-06, "loss": 0.93608713, "num_input_tokens_seen": 37942650, "step": 1759, "time_per_iteration": 4.1841137409210205 }, { "auxiliary_loss_clip": 0.0115455, "auxiliary_loss_mlp": 0.01056708, "balance_loss_clip": 1.05242109, "balance_loss_mlp": 1.03346229, "epoch": 0.10581692469562604, "flos": 14136667148160.0, "grad_norm": 2.509404173865799, "language_loss": 0.77406812, "learning_rate": 3.940004826678365e-06, "loss": 0.79618067, "num_input_tokens_seen": 37960660, "step": 1760, "time_per_iteration": 4.476959228515625 }, { "auxiliary_loss_clip": 0.01161737, "auxiliary_loss_mlp": 0.01064522, "balance_loss_clip": 1.0536418, "balance_loss_mlp": 1.04053712, "epoch": 0.105877047948294, "flos": 25958674903680.0, "grad_norm": 2.27300461956159, "language_loss": 0.88896096, "learning_rate": 3.939910113597498e-06, "loss": 0.91122353, "num_input_tokens_seen": 37978625, "step": 1761, "time_per_iteration": 2.6907520294189453 }, { "auxiliary_loss_clip": 0.01110571, "auxiliary_loss_mlp": 0.00782389, "balance_loss_clip": 1.04964042, "balance_loss_mlp": 1.00012767, "epoch": 0.10593717120096197, "flos": 30664768032000.0, "grad_norm": 2.010693315376097, "language_loss": 0.7809304, "learning_rate": 3.9398153269550464e-06, "loss": 0.79986, "num_input_tokens_seen": 38000005, "step": 1762, "time_per_iteration": 2.869051456451416 }, { "auxiliary_loss_clip": 0.01053171, "auxiliary_loss_mlp": 0.0105371, "balance_loss_clip": 1.02694225, "balance_loss_mlp": 1.05056334, "epoch": 0.10599729445362994, "flos": 66436682497920.0, "grad_norm": 0.8956567750819878, "language_loss": 0.60503203, "learning_rate": 3.939720466754602e-06, "loss": 0.6261009, "num_input_tokens_seen": 38066165, "step": 1763, "time_per_iteration": 5.049196720123291 }, { "auxiliary_loss_clip": 0.01156865, "auxiliary_loss_mlp": 0.01048706, "balance_loss_clip": 1.05424261, "balance_loss_mlp": 1.02708137, "epoch": 0.10605741770629791, "flos": 23948179879680.0, "grad_norm": 2.0510547250099633, "language_loss": 0.80232942, "learning_rate": 3.939625532999763e-06, "loss": 0.82438517, "num_input_tokens_seen": 38086150, "step": 1764, "time_per_iteration": 4.288762807846069 }, { "auxiliary_loss_clip": 0.01136032, "auxiliary_loss_mlp": 0.01055975, "balance_loss_clip": 1.04879069, "balance_loss_mlp": 1.03218043, "epoch": 0.10611754095896588, "flos": 19387524919680.0, "grad_norm": 1.693202084864273, "language_loss": 0.801691, "learning_rate": 3.9395305256941314e-06, "loss": 0.82361102, "num_input_tokens_seen": 38104205, "step": 1765, "time_per_iteration": 2.931269407272339 }, { "auxiliary_loss_clip": 0.01163261, "auxiliary_loss_mlp": 0.01058956, "balance_loss_clip": 1.05457163, "balance_loss_mlp": 1.0367949, "epoch": 0.10617766421163385, "flos": 22237755073920.0, "grad_norm": 1.7665774264343403, "language_loss": 0.76864165, "learning_rate": 3.939435444841306e-06, "loss": 0.79086387, "num_input_tokens_seen": 38122005, "step": 1766, "time_per_iteration": 2.5976176261901855 }, { "auxiliary_loss_clip": 0.01182495, "auxiliary_loss_mlp": 0.01059246, "balance_loss_clip": 1.05923963, "balance_loss_mlp": 1.03766894, "epoch": 0.10623778746430182, "flos": 28404407024640.0, "grad_norm": 1.6265727447650185, "language_loss": 0.77311498, "learning_rate": 3.939340290444895e-06, "loss": 0.79553241, "num_input_tokens_seen": 38143365, "step": 1767, "time_per_iteration": 2.6356630325317383 }, { "auxiliary_loss_clip": 0.01006515, "auxiliary_loss_mlp": 0.01018751, "balance_loss_clip": 1.03004837, "balance_loss_mlp": 1.0151509, "epoch": 0.10629791071696978, "flos": 64234639221120.0, "grad_norm": 0.9172341423433896, "language_loss": 0.57889944, "learning_rate": 3.939245062508506e-06, "loss": 0.59915209, "num_input_tokens_seen": 38210035, "step": 1768, "time_per_iteration": 3.6866471767425537 }, { "auxiliary_loss_clip": 0.01144481, "auxiliary_loss_mlp": 0.01047419, "balance_loss_clip": 1.0546546, "balance_loss_mlp": 1.02687907, "epoch": 0.10635803396963776, "flos": 22747578762240.0, "grad_norm": 1.4529696494540971, "language_loss": 0.86711109, "learning_rate": 3.939149761035749e-06, "loss": 0.8890301, "num_input_tokens_seen": 38231230, "step": 1769, "time_per_iteration": 3.936905860900879 }, { "auxiliary_loss_clip": 0.01141219, "auxiliary_loss_mlp": 0.00780338, "balance_loss_clip": 1.05321527, "balance_loss_mlp": 1.00008726, "epoch": 0.10641815722230573, "flos": 31395586147200.0, "grad_norm": 1.8275276693890916, "language_loss": 0.61906171, "learning_rate": 3.9390543860302395e-06, "loss": 0.63827729, "num_input_tokens_seen": 38253890, "step": 1770, "time_per_iteration": 2.8926138877868652 }, { "auxiliary_loss_clip": 0.01057689, "auxiliary_loss_mlp": 0.01010808, "balance_loss_clip": 1.02007711, "balance_loss_mlp": 1.00775671, "epoch": 0.1064782804749737, "flos": 58552527784320.0, "grad_norm": 0.9163874753670794, "language_loss": 0.57049137, "learning_rate": 3.9389589374955925e-06, "loss": 0.59117633, "num_input_tokens_seen": 38304290, "step": 1771, "time_per_iteration": 3.0783088207244873 }, { "auxiliary_loss_clip": 0.01146276, "auxiliary_loss_mlp": 0.01065918, "balance_loss_clip": 1.05574095, "balance_loss_mlp": 1.04465103, "epoch": 0.10653840372764166, "flos": 23987825516160.0, "grad_norm": 12.794881398939157, "language_loss": 0.88265753, "learning_rate": 3.938863415435429e-06, "loss": 0.90477949, "num_input_tokens_seen": 38324725, "step": 1772, "time_per_iteration": 2.770202159881592 }, { "auxiliary_loss_clip": 0.0118421, "auxiliary_loss_mlp": 0.01058161, "balance_loss_clip": 1.05697048, "balance_loss_mlp": 1.03497458, "epoch": 0.10659852698030964, "flos": 18294655668480.0, "grad_norm": 2.576940958490313, "language_loss": 0.76030588, "learning_rate": 3.93876781985337e-06, "loss": 0.78272957, "num_input_tokens_seen": 38340735, "step": 1773, "time_per_iteration": 2.6177070140838623 }, { "auxiliary_loss_clip": 0.01122733, "auxiliary_loss_mlp": 0.01067657, "balance_loss_clip": 1.04691553, "balance_loss_mlp": 1.04205084, "epoch": 0.1066586502329776, "flos": 32160591031680.0, "grad_norm": 1.868288871406422, "language_loss": 0.8330853, "learning_rate": 3.938672150753041e-06, "loss": 0.85498923, "num_input_tokens_seen": 38361315, "step": 1774, "time_per_iteration": 2.7396061420440674 }, { "auxiliary_loss_clip": 0.01156305, "auxiliary_loss_mlp": 0.00780518, "balance_loss_clip": 1.05627465, "balance_loss_mlp": 1.00011277, "epoch": 0.10671877348564557, "flos": 17785155202560.0, "grad_norm": 2.73383407032925, "language_loss": 0.76446521, "learning_rate": 3.9385764081380704e-06, "loss": 0.78383344, "num_input_tokens_seen": 38377425, "step": 1775, "time_per_iteration": 2.624208927154541 }, { "auxiliary_loss_clip": 0.01063199, "auxiliary_loss_mlp": 0.01007654, "balance_loss_clip": 1.01726675, "balance_loss_mlp": 1.00443542, "epoch": 0.10677889673831355, "flos": 63510177813120.0, "grad_norm": 0.8200823962511624, "language_loss": 0.57477289, "learning_rate": 3.9384805920120876e-06, "loss": 0.5954814, "num_input_tokens_seen": 38440275, "step": 1776, "time_per_iteration": 3.1782386302948 }, { "auxiliary_loss_clip": 0.01150087, "auxiliary_loss_mlp": 0.01066244, "balance_loss_clip": 1.05192852, "balance_loss_mlp": 1.0407691, "epoch": 0.10683901999098151, "flos": 22017694400640.0, "grad_norm": 1.4232532718517703, "language_loss": 0.83442962, "learning_rate": 3.938384702378727e-06, "loss": 0.85659301, "num_input_tokens_seen": 38461820, "step": 1777, "time_per_iteration": 2.7342305183410645 }, { "auxiliary_loss_clip": 0.01113855, "auxiliary_loss_mlp": 0.00780712, "balance_loss_clip": 1.04919302, "balance_loss_mlp": 1.00015831, "epoch": 0.10689914324364948, "flos": 25042952551680.0, "grad_norm": 1.8326039994575831, "language_loss": 0.87207437, "learning_rate": 3.938288739241625e-06, "loss": 0.89102006, "num_input_tokens_seen": 38482235, "step": 1778, "time_per_iteration": 2.859834671020508 }, { "auxiliary_loss_clip": 0.01152509, "auxiliary_loss_mlp": 0.00780436, "balance_loss_clip": 1.06804752, "balance_loss_mlp": 1.00019765, "epoch": 0.10695926649631746, "flos": 16435129507200.0, "grad_norm": 2.4525249429301823, "language_loss": 0.84165859, "learning_rate": 3.938192702604417e-06, "loss": 0.86098808, "num_input_tokens_seen": 38500690, "step": 1779, "time_per_iteration": 2.81423020362854 }, { "auxiliary_loss_clip": 0.01141718, "auxiliary_loss_mlp": 0.00779857, "balance_loss_clip": 1.05215359, "balance_loss_mlp": 1.0001775, "epoch": 0.10701938974898542, "flos": 16979211792000.0, "grad_norm": 1.9378348403129941, "language_loss": 0.66915894, "learning_rate": 3.9380965924707495e-06, "loss": 0.68837464, "num_input_tokens_seen": 38518405, "step": 1780, "time_per_iteration": 2.616684913635254 }, { "auxiliary_loss_clip": 0.01166288, "auxiliary_loss_mlp": 0.01054109, "balance_loss_clip": 1.05843914, "balance_loss_mlp": 1.03268683, "epoch": 0.10707951300165339, "flos": 15888102307200.0, "grad_norm": 1.9168180254288365, "language_loss": 0.92058647, "learning_rate": 3.938000408844265e-06, "loss": 0.94279045, "num_input_tokens_seen": 38535060, "step": 1781, "time_per_iteration": 2.6167802810668945 }, { "auxiliary_loss_clip": 0.0113109, "auxiliary_loss_mlp": 0.01064554, "balance_loss_clip": 1.0531441, "balance_loss_mlp": 1.04344225, "epoch": 0.10713963625432135, "flos": 14247164361600.0, "grad_norm": 1.8357670097294174, "language_loss": 0.79336482, "learning_rate": 3.9379041517286105e-06, "loss": 0.81532121, "num_input_tokens_seen": 38552855, "step": 1782, "time_per_iteration": 2.7669336795806885 }, { "auxiliary_loss_clip": 0.01158369, "auxiliary_loss_mlp": 0.01061646, "balance_loss_clip": 1.05510604, "balance_loss_mlp": 1.04016423, "epoch": 0.10719975950698933, "flos": 16756780821120.0, "grad_norm": 2.0914095256513945, "language_loss": 0.79086542, "learning_rate": 3.937807821127436e-06, "loss": 0.81306553, "num_input_tokens_seen": 38570075, "step": 1783, "time_per_iteration": 2.6349542140960693 }, { "auxiliary_loss_clip": 0.01164267, "auxiliary_loss_mlp": 0.01065333, "balance_loss_clip": 1.0570296, "balance_loss_mlp": 1.04299295, "epoch": 0.1072598827596573, "flos": 22710626645760.0, "grad_norm": 2.1874612027367806, "language_loss": 0.86421812, "learning_rate": 3.937711417044395e-06, "loss": 0.88651407, "num_input_tokens_seen": 38587970, "step": 1784, "time_per_iteration": 2.8452541828155518 }, { "auxiliary_loss_clip": 0.01153461, "auxiliary_loss_mlp": 0.01055605, "balance_loss_clip": 1.05502176, "balance_loss_mlp": 1.03321707, "epoch": 0.10732000601232526, "flos": 23258264376960.0, "grad_norm": 2.4649130783319553, "language_loss": 1.01192284, "learning_rate": 3.937614939483143e-06, "loss": 1.03401351, "num_input_tokens_seen": 38605840, "step": 1785, "time_per_iteration": 2.690018653869629 }, { "auxiliary_loss_clip": 0.01168517, "auxiliary_loss_mlp": 0.01060763, "balance_loss_clip": 1.05854678, "balance_loss_mlp": 1.03984189, "epoch": 0.10738012926499324, "flos": 24207060176640.0, "grad_norm": 1.397915549237645, "language_loss": 0.84951413, "learning_rate": 3.937518388447339e-06, "loss": 0.87180698, "num_input_tokens_seen": 38627070, "step": 1786, "time_per_iteration": 2.637430191040039 }, { "auxiliary_loss_clip": 0.01183118, "auxiliary_loss_mlp": 0.01059079, "balance_loss_clip": 1.05716729, "balance_loss_mlp": 1.03520155, "epoch": 0.1074402525176612, "flos": 20923065383040.0, "grad_norm": 1.7951357311742837, "language_loss": 0.78861409, "learning_rate": 3.937421763940642e-06, "loss": 0.81103605, "num_input_tokens_seen": 38645840, "step": 1787, "time_per_iteration": 2.54508900642395 }, { "auxiliary_loss_clip": 0.01174896, "auxiliary_loss_mlp": 0.01047406, "balance_loss_clip": 1.05971575, "balance_loss_mlp": 1.02528071, "epoch": 0.10750037577032917, "flos": 16946928443520.0, "grad_norm": 1.8536072321218278, "language_loss": 0.82307518, "learning_rate": 3.937325065966719e-06, "loss": 0.84529817, "num_input_tokens_seen": 38664770, "step": 1788, "time_per_iteration": 2.706247568130493 }, { "auxiliary_loss_clip": 0.01180896, "auxiliary_loss_mlp": 0.01064682, "balance_loss_clip": 1.05843878, "balance_loss_mlp": 1.04427314, "epoch": 0.10756049902299715, "flos": 20266546550400.0, "grad_norm": 2.110245519520894, "language_loss": 0.77840686, "learning_rate": 3.9372282945292335e-06, "loss": 0.80086267, "num_input_tokens_seen": 38683865, "step": 1789, "time_per_iteration": 2.6274654865264893 }, { "auxiliary_loss_clip": 0.01185566, "auxiliary_loss_mlp": 0.01065099, "balance_loss_clip": 1.0604099, "balance_loss_mlp": 1.04049408, "epoch": 0.10762062227566511, "flos": 23586523793280.0, "grad_norm": 2.7248977042722524, "language_loss": 0.74817526, "learning_rate": 3.937131449631859e-06, "loss": 0.77068192, "num_input_tokens_seen": 38702485, "step": 1790, "time_per_iteration": 2.624382972717285 }, { "auxiliary_loss_clip": 0.01178128, "auxiliary_loss_mlp": 0.00780572, "balance_loss_clip": 1.06110644, "balance_loss_mlp": 1.00021124, "epoch": 0.10768074552833308, "flos": 24310626065280.0, "grad_norm": 2.350797373347828, "language_loss": 0.78764236, "learning_rate": 3.9370345312782645e-06, "loss": 0.80722934, "num_input_tokens_seen": 38722475, "step": 1791, "time_per_iteration": 2.696162223815918 }, { "auxiliary_loss_clip": 0.01134133, "auxiliary_loss_mlp": 0.01065057, "balance_loss_clip": 1.05280125, "balance_loss_mlp": 1.04117918, "epoch": 0.10774086878100106, "flos": 25299965341440.0, "grad_norm": 1.5879424734455678, "language_loss": 0.70638013, "learning_rate": 3.936937539472126e-06, "loss": 0.7283721, "num_input_tokens_seen": 38743285, "step": 1792, "time_per_iteration": 2.770874261856079 }, { "auxiliary_loss_clip": 0.01149934, "auxiliary_loss_mlp": 0.01051019, "balance_loss_clip": 1.05610943, "balance_loss_mlp": 1.02764249, "epoch": 0.10780099203366902, "flos": 22054035985920.0, "grad_norm": 1.920104493539276, "language_loss": 0.76565266, "learning_rate": 3.9368404742171236e-06, "loss": 0.78766215, "num_input_tokens_seen": 38763035, "step": 1793, "time_per_iteration": 2.7218761444091797 }, { "auxiliary_loss_clip": 0.01116412, "auxiliary_loss_mlp": 0.01064574, "balance_loss_clip": 1.05029237, "balance_loss_mlp": 1.0414238, "epoch": 0.10786111528633699, "flos": 22747471021440.0, "grad_norm": 1.7475786500241859, "language_loss": 0.85103315, "learning_rate": 3.936743335516936e-06, "loss": 0.87284303, "num_input_tokens_seen": 38784900, "step": 1794, "time_per_iteration": 2.7590620517730713 }, { "auxiliary_loss_clip": 0.01115198, "auxiliary_loss_mlp": 0.01055294, "balance_loss_clip": 1.04807687, "balance_loss_mlp": 1.03146446, "epoch": 0.10792123853900495, "flos": 20851064570880.0, "grad_norm": 2.5236234593460924, "language_loss": 0.74585378, "learning_rate": 3.936646123375246e-06, "loss": 0.76755869, "num_input_tokens_seen": 38804695, "step": 1795, "time_per_iteration": 2.8500585556030273 }, { "auxiliary_loss_clip": 0.01124895, "auxiliary_loss_mlp": 0.01058294, "balance_loss_clip": 1.04831553, "balance_loss_mlp": 1.03479767, "epoch": 0.10798136179167293, "flos": 17748705876480.0, "grad_norm": 2.842374039298248, "language_loss": 0.81653619, "learning_rate": 3.936548837795741e-06, "loss": 0.83836806, "num_input_tokens_seen": 38822395, "step": 1796, "time_per_iteration": 2.7549750804901123 }, { "auxiliary_loss_clip": 0.01140492, "auxiliary_loss_mlp": 0.01083966, "balance_loss_clip": 1.05246449, "balance_loss_mlp": 1.05721593, "epoch": 0.1080414850443409, "flos": 13589639948160.0, "grad_norm": 2.59635455269928, "language_loss": 0.74233043, "learning_rate": 3.936451478782111e-06, "loss": 0.764575, "num_input_tokens_seen": 38839865, "step": 1797, "time_per_iteration": 2.6396753787994385 }, { "auxiliary_loss_clip": 0.01160286, "auxiliary_loss_mlp": 0.01049954, "balance_loss_clip": 1.05505061, "balance_loss_mlp": 1.02874684, "epoch": 0.10810160829700886, "flos": 16253421580800.0, "grad_norm": 2.0852339617015025, "language_loss": 0.81855786, "learning_rate": 3.936354046338046e-06, "loss": 0.84066033, "num_input_tokens_seen": 38857300, "step": 1798, "time_per_iteration": 2.7105324268341064 }, { "auxiliary_loss_clip": 0.01142859, "auxiliary_loss_mlp": 0.01054502, "balance_loss_clip": 1.05379176, "balance_loss_mlp": 1.03117299, "epoch": 0.10816173154967684, "flos": 15158002464000.0, "grad_norm": 2.4443000829323687, "language_loss": 0.85516405, "learning_rate": 3.936256540467242e-06, "loss": 0.87713766, "num_input_tokens_seen": 38874960, "step": 1799, "time_per_iteration": 4.159978628158569 }, { "auxiliary_loss_clip": 0.01154352, "auxiliary_loss_mlp": 0.01062903, "balance_loss_clip": 1.05493283, "balance_loss_mlp": 1.04114687, "epoch": 0.10822185480234481, "flos": 17785334770560.0, "grad_norm": 2.7405734706827825, "language_loss": 0.77434146, "learning_rate": 3.9361589611733955e-06, "loss": 0.79651403, "num_input_tokens_seen": 38893610, "step": 1800, "time_per_iteration": 4.52047872543335 }, { "auxiliary_loss_clip": 0.01178634, "auxiliary_loss_mlp": 0.0104758, "balance_loss_clip": 1.05722904, "balance_loss_mlp": 1.02689719, "epoch": 0.10828197805501277, "flos": 25556654908800.0, "grad_norm": 1.582468034859118, "language_loss": 0.72897375, "learning_rate": 3.9360613084602075e-06, "loss": 0.75123584, "num_input_tokens_seen": 38913485, "step": 1801, "time_per_iteration": 4.291400909423828 }, { "auxiliary_loss_clip": 0.01190595, "auxiliary_loss_mlp": 0.01056056, "balance_loss_clip": 1.06095624, "balance_loss_mlp": 1.03478956, "epoch": 0.10834210130768075, "flos": 28984435845120.0, "grad_norm": 1.951139287607183, "language_loss": 0.6634692, "learning_rate": 3.935963582331381e-06, "loss": 0.68593562, "num_input_tokens_seen": 38935650, "step": 1802, "time_per_iteration": 2.722628355026245 }, { "auxiliary_loss_clip": 0.01155661, "auxiliary_loss_mlp": 0.01059375, "balance_loss_clip": 1.05326533, "balance_loss_mlp": 1.03695142, "epoch": 0.10840222456034872, "flos": 20264212166400.0, "grad_norm": 2.084551157592464, "language_loss": 0.81612957, "learning_rate": 3.935865782790621e-06, "loss": 0.8382799, "num_input_tokens_seen": 38954130, "step": 1803, "time_per_iteration": 4.239379167556763 }, { "auxiliary_loss_clip": 0.01163104, "auxiliary_loss_mlp": 0.01061781, "balance_loss_clip": 1.0567112, "balance_loss_mlp": 1.03921473, "epoch": 0.10846234781301668, "flos": 19863054097920.0, "grad_norm": 1.9102934552723363, "language_loss": 0.91127038, "learning_rate": 3.9357679098416365e-06, "loss": 0.93351918, "num_input_tokens_seen": 38972905, "step": 1804, "time_per_iteration": 2.5836737155914307 }, { "auxiliary_loss_clip": 0.01136188, "auxiliary_loss_mlp": 0.01060133, "balance_loss_clip": 1.05617714, "balance_loss_mlp": 1.03718543, "epoch": 0.10852247106568465, "flos": 26469037296000.0, "grad_norm": 2.5742522317806262, "language_loss": 0.76198906, "learning_rate": 3.935669963488139e-06, "loss": 0.78395224, "num_input_tokens_seen": 38993255, "step": 1805, "time_per_iteration": 2.783137321472168 }, { "auxiliary_loss_clip": 0.01149468, "auxiliary_loss_mlp": 0.01050946, "balance_loss_clip": 1.05419612, "balance_loss_mlp": 1.03050184, "epoch": 0.10858259431835263, "flos": 30081506987520.0, "grad_norm": 1.7049574807827799, "language_loss": 0.85876733, "learning_rate": 3.935571943733843e-06, "loss": 0.88077152, "num_input_tokens_seen": 39012610, "step": 1806, "time_per_iteration": 2.8148701190948486 }, { "auxiliary_loss_clip": 0.01168733, "auxiliary_loss_mlp": 0.00779888, "balance_loss_clip": 1.05462408, "balance_loss_mlp": 1.00006652, "epoch": 0.10864271757102059, "flos": 19063180085760.0, "grad_norm": 2.554050049117878, "language_loss": 0.8108198, "learning_rate": 3.9354738505824635e-06, "loss": 0.83030605, "num_input_tokens_seen": 39030120, "step": 1807, "time_per_iteration": 2.6275649070739746 }, { "auxiliary_loss_clip": 0.01139085, "auxiliary_loss_mlp": 0.01055438, "balance_loss_clip": 1.05193985, "balance_loss_mlp": 1.03522038, "epoch": 0.10870284082368856, "flos": 24715052271360.0, "grad_norm": 1.834914777588586, "language_loss": 0.78910971, "learning_rate": 3.9353756840377225e-06, "loss": 0.81105494, "num_input_tokens_seen": 39049875, "step": 1808, "time_per_iteration": 2.722910165786743 }, { "auxiliary_loss_clip": 0.01157997, "auxiliary_loss_mlp": 0.01056971, "balance_loss_clip": 1.05918014, "balance_loss_mlp": 1.03548992, "epoch": 0.10876296407635654, "flos": 20627663932800.0, "grad_norm": 1.6201371380093192, "language_loss": 0.79013431, "learning_rate": 3.935277444103342e-06, "loss": 0.81228393, "num_input_tokens_seen": 39068935, "step": 1809, "time_per_iteration": 2.7261481285095215 }, { "auxiliary_loss_clip": 0.01180468, "auxiliary_loss_mlp": 0.01057915, "balance_loss_clip": 1.0568099, "balance_loss_mlp": 1.03705359, "epoch": 0.1088230873290245, "flos": 21579835610880.0, "grad_norm": 1.9004896030263678, "language_loss": 0.85129547, "learning_rate": 3.935179130783046e-06, "loss": 0.87367928, "num_input_tokens_seen": 39087370, "step": 1810, "time_per_iteration": 2.672696828842163 }, { "auxiliary_loss_clip": 0.01124301, "auxiliary_loss_mlp": 0.01057363, "balance_loss_clip": 1.04580724, "balance_loss_mlp": 1.0335803, "epoch": 0.10888321058169247, "flos": 26469037296000.0, "grad_norm": 1.5993643379141278, "language_loss": 0.63822675, "learning_rate": 3.935080744080564e-06, "loss": 0.66004336, "num_input_tokens_seen": 39106635, "step": 1811, "time_per_iteration": 2.7731611728668213 }, { "auxiliary_loss_clip": 0.01151891, "auxiliary_loss_mlp": 0.01050225, "balance_loss_clip": 1.05335796, "balance_loss_mlp": 1.02836192, "epoch": 0.10894333383436045, "flos": 25848608653440.0, "grad_norm": 1.9284151803363307, "language_loss": 0.74238706, "learning_rate": 3.934982283999626e-06, "loss": 0.76440823, "num_input_tokens_seen": 39126335, "step": 1812, "time_per_iteration": 2.727743625640869 }, { "auxiliary_loss_clip": 0.01142498, "auxiliary_loss_mlp": 0.01057826, "balance_loss_clip": 1.05199611, "balance_loss_mlp": 1.03546214, "epoch": 0.10900345708702841, "flos": 19537093152000.0, "grad_norm": 1.5783196636767667, "language_loss": 0.72746086, "learning_rate": 3.934883750543966e-06, "loss": 0.74946409, "num_input_tokens_seen": 39144820, "step": 1813, "time_per_iteration": 2.798297166824341 }, { "auxiliary_loss_clip": 0.0113892, "auxiliary_loss_mlp": 0.01056639, "balance_loss_clip": 1.0511452, "balance_loss_mlp": 1.03515792, "epoch": 0.10906358033969638, "flos": 23623296341760.0, "grad_norm": 1.635228619121262, "language_loss": 0.82981038, "learning_rate": 3.93478514371732e-06, "loss": 0.85176599, "num_input_tokens_seen": 39165945, "step": 1814, "time_per_iteration": 2.7120048999786377 }, { "auxiliary_loss_clip": 0.01141958, "auxiliary_loss_mlp": 0.01058857, "balance_loss_clip": 1.0537864, "balance_loss_mlp": 1.03787625, "epoch": 0.10912370359236434, "flos": 21214731818880.0, "grad_norm": 1.9556743991494996, "language_loss": 0.84310579, "learning_rate": 3.934686463523429e-06, "loss": 0.86511397, "num_input_tokens_seen": 39183520, "step": 1815, "time_per_iteration": 2.788870096206665 }, { "auxiliary_loss_clip": 0.01146878, "auxiliary_loss_mlp": 0.01055141, "balance_loss_clip": 1.05443966, "balance_loss_mlp": 1.03182411, "epoch": 0.10918382684503232, "flos": 13553190622080.0, "grad_norm": 2.5374826422013195, "language_loss": 0.71670222, "learning_rate": 3.9345877099660315e-06, "loss": 0.73872244, "num_input_tokens_seen": 39201190, "step": 1816, "time_per_iteration": 2.8424103260040283 }, { "auxiliary_loss_clip": 0.01164173, "auxiliary_loss_mlp": 0.01064184, "balance_loss_clip": 1.05216932, "balance_loss_mlp": 1.04052126, "epoch": 0.10924395009770028, "flos": 27964321591680.0, "grad_norm": 2.016899555923086, "language_loss": 0.72880268, "learning_rate": 3.9344888830488744e-06, "loss": 0.75108624, "num_input_tokens_seen": 39221210, "step": 1817, "time_per_iteration": 2.7320947647094727 }, { "auxiliary_loss_clip": 0.01116915, "auxiliary_loss_mlp": 0.01057856, "balance_loss_clip": 1.05173278, "balance_loss_mlp": 1.03517008, "epoch": 0.10930407335036825, "flos": 25593750679680.0, "grad_norm": 1.5988628345308824, "language_loss": 0.67275256, "learning_rate": 3.934389982775706e-06, "loss": 0.69450033, "num_input_tokens_seen": 39242025, "step": 1818, "time_per_iteration": 2.8700790405273438 }, { "auxiliary_loss_clip": 0.01155804, "auxiliary_loss_mlp": 0.01065952, "balance_loss_clip": 1.05673873, "balance_loss_mlp": 1.04313517, "epoch": 0.10936419660303623, "flos": 18406194376320.0, "grad_norm": 3.593580913512793, "language_loss": 0.73149616, "learning_rate": 3.934291009150275e-06, "loss": 0.75371373, "num_input_tokens_seen": 39259870, "step": 1819, "time_per_iteration": 2.7091007232666016 }, { "auxiliary_loss_clip": 0.01142955, "auxiliary_loss_mlp": 0.00779155, "balance_loss_clip": 1.05341268, "balance_loss_mlp": 1.00027704, "epoch": 0.1094243198557042, "flos": 23840052963840.0, "grad_norm": 4.531598275817935, "language_loss": 0.73764241, "learning_rate": 3.934191962176335e-06, "loss": 0.75686359, "num_input_tokens_seen": 39278500, "step": 1820, "time_per_iteration": 2.6513099670410156 }, { "auxiliary_loss_clip": 0.01179358, "auxiliary_loss_mlp": 0.01056073, "balance_loss_clip": 1.05747604, "balance_loss_mlp": 1.03297031, "epoch": 0.10948444310837216, "flos": 14643940970880.0, "grad_norm": 2.2567103978329337, "language_loss": 0.82532805, "learning_rate": 3.934092841857642e-06, "loss": 0.84768236, "num_input_tokens_seen": 39294800, "step": 1821, "time_per_iteration": 2.5348384380340576 }, { "auxiliary_loss_clip": 0.01148016, "auxiliary_loss_mlp": 0.01052031, "balance_loss_clip": 1.05133605, "balance_loss_mlp": 1.03077567, "epoch": 0.10954456636104014, "flos": 27818811596160.0, "grad_norm": 2.0770330480401578, "language_loss": 0.76271641, "learning_rate": 3.933993648197955e-06, "loss": 0.7847169, "num_input_tokens_seen": 39314625, "step": 1822, "time_per_iteration": 2.730079174041748 }, { "auxiliary_loss_clip": 0.01142446, "auxiliary_loss_mlp": 0.01049259, "balance_loss_clip": 1.04849207, "balance_loss_mlp": 1.02856421, "epoch": 0.1096046896137081, "flos": 33620934372480.0, "grad_norm": 1.734419613996414, "language_loss": 0.79309607, "learning_rate": 3.933894381201034e-06, "loss": 0.81501311, "num_input_tokens_seen": 39336465, "step": 1823, "time_per_iteration": 2.756969928741455 }, { "auxiliary_loss_clip": 0.01148165, "auxiliary_loss_mlp": 0.01049595, "balance_loss_clip": 1.05160606, "balance_loss_mlp": 1.02745807, "epoch": 0.10966481286637607, "flos": 26980010219520.0, "grad_norm": 1.4318009514182364, "language_loss": 0.79590744, "learning_rate": 3.933795040870645e-06, "loss": 0.81788504, "num_input_tokens_seen": 39357930, "step": 1824, "time_per_iteration": 2.798168182373047 }, { "auxiliary_loss_clip": 0.01142146, "auxiliary_loss_mlp": 0.01055513, "balance_loss_clip": 1.05104232, "balance_loss_mlp": 1.03381693, "epoch": 0.10972493611904403, "flos": 23036551678080.0, "grad_norm": 2.127143421089703, "language_loss": 0.88138539, "learning_rate": 3.933695627210554e-06, "loss": 0.90336192, "num_input_tokens_seen": 39376380, "step": 1825, "time_per_iteration": 2.6804513931274414 }, { "auxiliary_loss_clip": 0.01128623, "auxiliary_loss_mlp": 0.01056127, "balance_loss_clip": 1.04586983, "balance_loss_mlp": 1.03439498, "epoch": 0.10978505937171201, "flos": 38104632443520.0, "grad_norm": 1.721192594935189, "language_loss": 0.76441038, "learning_rate": 3.933596140224532e-06, "loss": 0.78625786, "num_input_tokens_seen": 39399935, "step": 1826, "time_per_iteration": 2.8315086364746094 }, { "auxiliary_loss_clip": 0.01063155, "auxiliary_loss_mlp": 0.01016957, "balance_loss_clip": 1.02709544, "balance_loss_mlp": 1.01409554, "epoch": 0.10984518262437998, "flos": 59849694616320.0, "grad_norm": 0.8518463216820418, "language_loss": 0.54997343, "learning_rate": 3.93349657991635e-06, "loss": 0.57077461, "num_input_tokens_seen": 39460685, "step": 1827, "time_per_iteration": 3.1425766944885254 }, { "auxiliary_loss_clip": 0.01072651, "auxiliary_loss_mlp": 0.01010167, "balance_loss_clip": 1.02693772, "balance_loss_mlp": 1.00717473, "epoch": 0.10990530587704794, "flos": 66719837410560.0, "grad_norm": 0.7375455878808789, "language_loss": 0.55382878, "learning_rate": 3.933396946289784e-06, "loss": 0.57465696, "num_input_tokens_seen": 39524765, "step": 1828, "time_per_iteration": 3.168165922164917 }, { "auxiliary_loss_clip": 0.01156998, "auxiliary_loss_mlp": 0.01059335, "balance_loss_clip": 1.05407059, "balance_loss_mlp": 1.03618491, "epoch": 0.10996542912971592, "flos": 25447199189760.0, "grad_norm": 2.250827401167328, "language_loss": 0.84010404, "learning_rate": 3.933297239348612e-06, "loss": 0.86226743, "num_input_tokens_seen": 39543640, "step": 1829, "time_per_iteration": 2.7341628074645996 }, { "auxiliary_loss_clip": 0.01130747, "auxiliary_loss_mlp": 0.01053464, "balance_loss_clip": 1.0547024, "balance_loss_mlp": 1.03036165, "epoch": 0.11002555238238389, "flos": 44018186186880.0, "grad_norm": 2.342204785330024, "language_loss": 0.88880253, "learning_rate": 3.933197459096614e-06, "loss": 0.91064465, "num_input_tokens_seen": 39567525, "step": 1830, "time_per_iteration": 2.9093260765075684 }, { "auxiliary_loss_clip": 0.01049643, "auxiliary_loss_mlp": 0.01009685, "balance_loss_clip": 1.02618647, "balance_loss_mlp": 1.00681162, "epoch": 0.11008567563505185, "flos": 54065133590400.0, "grad_norm": 0.6882192363357665, "language_loss": 0.55566543, "learning_rate": 3.9330976055375756e-06, "loss": 0.57625872, "num_input_tokens_seen": 39628470, "step": 1831, "time_per_iteration": 3.1713974475860596 }, { "auxiliary_loss_clip": 0.01156783, "auxiliary_loss_mlp": 0.01073931, "balance_loss_clip": 1.05708003, "balance_loss_mlp": 1.04965997, "epoch": 0.11014579888771983, "flos": 24243150366720.0, "grad_norm": 2.4937725361201495, "language_loss": 0.90836191, "learning_rate": 3.932997678675282e-06, "loss": 0.93066907, "num_input_tokens_seen": 39646670, "step": 1832, "time_per_iteration": 2.6786489486694336 }, { "auxiliary_loss_clip": 0.0106111, "auxiliary_loss_mlp": 0.01010664, "balance_loss_clip": 1.02332854, "balance_loss_mlp": 1.00769615, "epoch": 0.1102059221403878, "flos": 57743965658880.0, "grad_norm": 0.7154576595208243, "language_loss": 0.59911001, "learning_rate": 3.932897678513523e-06, "loss": 0.61982775, "num_input_tokens_seen": 39712915, "step": 1833, "time_per_iteration": 3.1802401542663574 }, { "auxiliary_loss_clip": 0.01167201, "auxiliary_loss_mlp": 0.0105502, "balance_loss_clip": 1.05312014, "balance_loss_mlp": 1.03285873, "epoch": 0.11026604539305576, "flos": 16795923667200.0, "grad_norm": 2.6772934272606923, "language_loss": 0.80799395, "learning_rate": 3.93279760505609e-06, "loss": 0.83021617, "num_input_tokens_seen": 39730650, "step": 1834, "time_per_iteration": 2.591374635696411 }, { "auxiliary_loss_clip": 0.01141662, "auxiliary_loss_mlp": 0.01054827, "balance_loss_clip": 1.05557871, "balance_loss_mlp": 1.03004324, "epoch": 0.11032616864572373, "flos": 23988076911360.0, "grad_norm": 2.4853906687508247, "language_loss": 0.89856094, "learning_rate": 3.932697458306779e-06, "loss": 0.92052579, "num_input_tokens_seen": 39751065, "step": 1835, "time_per_iteration": 2.742330312728882 }, { "auxiliary_loss_clip": 0.01131787, "auxiliary_loss_mlp": 0.01063812, "balance_loss_clip": 1.0524013, "balance_loss_mlp": 1.03758645, "epoch": 0.1103862918983917, "flos": 19683141851520.0, "grad_norm": 2.2754442269720023, "language_loss": 0.63256055, "learning_rate": 3.932597238269386e-06, "loss": 0.65451658, "num_input_tokens_seen": 39769245, "step": 1836, "time_per_iteration": 2.6935038566589355 }, { "auxiliary_loss_clip": 0.01138919, "auxiliary_loss_mlp": 0.01061469, "balance_loss_clip": 1.05021358, "balance_loss_mlp": 1.03954661, "epoch": 0.11044641515105967, "flos": 32160878340480.0, "grad_norm": 1.6726289784191204, "language_loss": 0.72792488, "learning_rate": 3.932496944947711e-06, "loss": 0.74992871, "num_input_tokens_seen": 39790830, "step": 1837, "time_per_iteration": 2.7790510654449463 }, { "auxiliary_loss_clip": 0.01165472, "auxiliary_loss_mlp": 0.01057035, "balance_loss_clip": 1.05463088, "balance_loss_mlp": 1.03551781, "epoch": 0.11050653840372764, "flos": 16689233295360.0, "grad_norm": 2.027055787194766, "language_loss": 0.78489268, "learning_rate": 3.93239657834556e-06, "loss": 0.8071177, "num_input_tokens_seen": 39809475, "step": 1838, "time_per_iteration": 4.098532438278198 }, { "auxiliary_loss_clip": 0.01154042, "auxiliary_loss_mlp": 0.01062407, "balance_loss_clip": 1.05542612, "balance_loss_mlp": 1.03970969, "epoch": 0.11056666165639562, "flos": 21208877902080.0, "grad_norm": 2.046221888979386, "language_loss": 0.71451718, "learning_rate": 3.932296138466736e-06, "loss": 0.7366817, "num_input_tokens_seen": 39826355, "step": 1839, "time_per_iteration": 4.205714464187622 }, { "auxiliary_loss_clip": 0.01187588, "auxiliary_loss_mlp": 0.00781104, "balance_loss_clip": 1.06183171, "balance_loss_mlp": 1.00018013, "epoch": 0.11062678490906358, "flos": 19165488998400.0, "grad_norm": 2.623062836625425, "language_loss": 0.79027873, "learning_rate": 3.93219562531505e-06, "loss": 0.80996567, "num_input_tokens_seen": 39845335, "step": 1840, "time_per_iteration": 2.6023378372192383 }, { "auxiliary_loss_clip": 0.01156508, "auxiliary_loss_mlp": 0.01052512, "balance_loss_clip": 1.05206251, "balance_loss_mlp": 1.02887261, "epoch": 0.11068690816173155, "flos": 24895287740160.0, "grad_norm": 1.7551987843009527, "language_loss": 0.88083529, "learning_rate": 3.932095038894311e-06, "loss": 0.90292549, "num_input_tokens_seen": 39865065, "step": 1841, "time_per_iteration": 4.3361639976501465 }, { "auxiliary_loss_clip": 0.01130203, "auxiliary_loss_mlp": 0.01067683, "balance_loss_clip": 1.05036247, "balance_loss_mlp": 1.04453301, "epoch": 0.11074703141439952, "flos": 16472368932480.0, "grad_norm": 3.1603067125494126, "language_loss": 0.90521991, "learning_rate": 3.931994379208334e-06, "loss": 0.92719877, "num_input_tokens_seen": 39882780, "step": 1842, "time_per_iteration": 2.7086760997772217 }, { "auxiliary_loss_clip": 0.01152506, "auxiliary_loss_mlp": 0.01061227, "balance_loss_clip": 1.05065131, "balance_loss_mlp": 1.03982854, "epoch": 0.11080715466706749, "flos": 19172420323200.0, "grad_norm": 2.112801816568727, "language_loss": 0.85845053, "learning_rate": 3.931893646260937e-06, "loss": 0.88058788, "num_input_tokens_seen": 39900295, "step": 1843, "time_per_iteration": 4.263117790222168 }, { "auxiliary_loss_clip": 0.01119254, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.05050898, "balance_loss_mlp": 1.00012159, "epoch": 0.11086727791973545, "flos": 27704687109120.0, "grad_norm": 1.4511349711086798, "language_loss": 0.74735641, "learning_rate": 3.931792840055941e-06, "loss": 0.76637971, "num_input_tokens_seen": 39922075, "step": 1844, "time_per_iteration": 2.7999000549316406 }, { "auxiliary_loss_clip": 0.01180395, "auxiliary_loss_mlp": 0.01055824, "balance_loss_clip": 1.05662274, "balance_loss_mlp": 1.03238785, "epoch": 0.11092740117240343, "flos": 18514967736960.0, "grad_norm": 2.017286766878137, "language_loss": 0.7566812, "learning_rate": 3.931691960597165e-06, "loss": 0.77904338, "num_input_tokens_seen": 39940115, "step": 1845, "time_per_iteration": 2.5305535793304443 }, { "auxiliary_loss_clip": 0.01153403, "auxiliary_loss_mlp": 0.01058911, "balance_loss_clip": 1.05442989, "balance_loss_mlp": 1.03807366, "epoch": 0.1109875244250714, "flos": 20522446018560.0, "grad_norm": 1.9628359583393364, "language_loss": 0.75953126, "learning_rate": 3.9315910078884375e-06, "loss": 0.78165436, "num_input_tokens_seen": 39959920, "step": 1846, "time_per_iteration": 2.719325542449951 }, { "auxiliary_loss_clip": 0.01173899, "auxiliary_loss_mlp": 0.01059369, "balance_loss_clip": 1.05823123, "balance_loss_mlp": 1.03717244, "epoch": 0.11104764767773936, "flos": 14098601710080.0, "grad_norm": 2.612459533347621, "language_loss": 0.8620472, "learning_rate": 3.931489981933584e-06, "loss": 0.88437986, "num_input_tokens_seen": 39974755, "step": 1847, "time_per_iteration": 2.7705559730529785 }, { "auxiliary_loss_clip": 0.01181158, "auxiliary_loss_mlp": 0.01055145, "balance_loss_clip": 1.05562854, "balance_loss_mlp": 1.0322808, "epoch": 0.11110777093040733, "flos": 20594518657920.0, "grad_norm": 1.8452742714770096, "language_loss": 0.76981926, "learning_rate": 3.931388882736438e-06, "loss": 0.79218227, "num_input_tokens_seen": 39993355, "step": 1848, "time_per_iteration": 2.605933666229248 }, { "auxiliary_loss_clip": 0.01172398, "auxiliary_loss_mlp": 0.01056349, "balance_loss_clip": 1.06262445, "balance_loss_mlp": 1.03455794, "epoch": 0.11116789418307531, "flos": 21870065502720.0, "grad_norm": 1.6943193134392138, "language_loss": 0.77621841, "learning_rate": 3.931287710300832e-06, "loss": 0.7985059, "num_input_tokens_seen": 40012410, "step": 1849, "time_per_iteration": 2.678415536880493 }, { "auxiliary_loss_clip": 0.01138995, "auxiliary_loss_mlp": 0.00781122, "balance_loss_clip": 1.05277848, "balance_loss_mlp": 1.00010324, "epoch": 0.11122801743574327, "flos": 15523106256000.0, "grad_norm": 3.3234972538165066, "language_loss": 0.72098577, "learning_rate": 3.931186464630601e-06, "loss": 0.74018693, "num_input_tokens_seen": 40029315, "step": 1850, "time_per_iteration": 2.7763028144836426 }, { "auxiliary_loss_clip": 0.01170569, "auxiliary_loss_mlp": 0.01061108, "balance_loss_clip": 1.05759382, "balance_loss_mlp": 1.03874469, "epoch": 0.11128814068841124, "flos": 14392279307520.0, "grad_norm": 2.0638339407107873, "language_loss": 0.81499028, "learning_rate": 3.931085145729588e-06, "loss": 0.83730704, "num_input_tokens_seen": 40045765, "step": 1851, "time_per_iteration": 2.688854694366455 }, { "auxiliary_loss_clip": 0.01164692, "auxiliary_loss_mlp": 0.01061301, "balance_loss_clip": 1.05789042, "balance_loss_mlp": 1.04027295, "epoch": 0.11134826394107922, "flos": 16653933204480.0, "grad_norm": 2.365035468310974, "language_loss": 0.88270009, "learning_rate": 3.930983753601631e-06, "loss": 0.90496004, "num_input_tokens_seen": 40061660, "step": 1852, "time_per_iteration": 2.659914493560791 }, { "auxiliary_loss_clip": 0.01166772, "auxiliary_loss_mlp": 0.01060698, "balance_loss_clip": 1.05489326, "balance_loss_mlp": 1.03791702, "epoch": 0.11140838719374718, "flos": 16690993061760.0, "grad_norm": 2.1825610274136054, "language_loss": 0.72492862, "learning_rate": 3.930882288250578e-06, "loss": 0.74720335, "num_input_tokens_seen": 40080180, "step": 1853, "time_per_iteration": 2.7840964794158936 }, { "auxiliary_loss_clip": 0.01069898, "auxiliary_loss_mlp": 0.01019902, "balance_loss_clip": 1.02549517, "balance_loss_mlp": 1.01701725, "epoch": 0.11146851044641515, "flos": 60976355587200.0, "grad_norm": 0.772231443606995, "language_loss": 0.53664064, "learning_rate": 3.930780749680273e-06, "loss": 0.55753863, "num_input_tokens_seen": 40138910, "step": 1854, "time_per_iteration": 3.089354991912842 }, { "auxiliary_loss_clip": 0.01159576, "auxiliary_loss_mlp": 0.0105585, "balance_loss_clip": 1.05390525, "balance_loss_mlp": 1.03184092, "epoch": 0.11152863369908313, "flos": 22193835719040.0, "grad_norm": 1.863523240792578, "language_loss": 0.8468501, "learning_rate": 3.9306791378945705e-06, "loss": 0.86900431, "num_input_tokens_seen": 40157745, "step": 1855, "time_per_iteration": 2.7361156940460205 }, { "auxiliary_loss_clip": 0.01147504, "auxiliary_loss_mlp": 0.01064479, "balance_loss_clip": 1.05225825, "balance_loss_mlp": 1.0424726, "epoch": 0.11158875695175109, "flos": 19537524115200.0, "grad_norm": 2.1217067547931756, "language_loss": 0.81187081, "learning_rate": 3.9305774528973205e-06, "loss": 0.83399057, "num_input_tokens_seen": 40175375, "step": 1856, "time_per_iteration": 2.7158002853393555 }, { "auxiliary_loss_clip": 0.01168288, "auxiliary_loss_mlp": 0.01052259, "balance_loss_clip": 1.05843937, "balance_loss_mlp": 1.02957392, "epoch": 0.11164888020441906, "flos": 25442709989760.0, "grad_norm": 2.0555738298465314, "language_loss": 0.82761133, "learning_rate": 3.93047569469238e-06, "loss": 0.8498168, "num_input_tokens_seen": 40195715, "step": 1857, "time_per_iteration": 2.647184133529663 }, { "auxiliary_loss_clip": 0.01144196, "auxiliary_loss_mlp": 0.01044915, "balance_loss_clip": 1.05255508, "balance_loss_mlp": 1.02395833, "epoch": 0.11170900345708702, "flos": 15632741543040.0, "grad_norm": 2.3199985887988914, "language_loss": 0.83131742, "learning_rate": 3.930373863283608e-06, "loss": 0.85320854, "num_input_tokens_seen": 40213975, "step": 1858, "time_per_iteration": 2.726905107498169 }, { "auxiliary_loss_clip": 0.01134962, "auxiliary_loss_mlp": 0.01067658, "balance_loss_clip": 1.04900265, "balance_loss_mlp": 1.04350638, "epoch": 0.111769126709755, "flos": 23039424766080.0, "grad_norm": 2.0395414997027657, "language_loss": 0.9133389, "learning_rate": 3.930271958674866e-06, "loss": 0.93536508, "num_input_tokens_seen": 40233905, "step": 1859, "time_per_iteration": 3.0006766319274902 }, { "auxiliary_loss_clip": 0.01167289, "auxiliary_loss_mlp": 0.01049698, "balance_loss_clip": 1.05445409, "balance_loss_mlp": 1.02751315, "epoch": 0.11182924996242297, "flos": 20850705434880.0, "grad_norm": 2.048197345879043, "language_loss": 0.81528586, "learning_rate": 3.930169980870018e-06, "loss": 0.83745575, "num_input_tokens_seen": 40252810, "step": 1860, "time_per_iteration": 2.7216553688049316 }, { "auxiliary_loss_clip": 0.01154007, "auxiliary_loss_mlp": 0.01060885, "balance_loss_clip": 1.05737674, "balance_loss_mlp": 1.03920078, "epoch": 0.11188937321509093, "flos": 17455315587840.0, "grad_norm": 2.00330439318394, "language_loss": 0.75250578, "learning_rate": 3.930067929872931e-06, "loss": 0.77465475, "num_input_tokens_seen": 40272000, "step": 1861, "time_per_iteration": 2.6878490447998047 }, { "auxiliary_loss_clip": 0.01177651, "auxiliary_loss_mlp": 0.01054452, "balance_loss_clip": 1.0565964, "balance_loss_mlp": 1.03360212, "epoch": 0.11194949646775891, "flos": 24095916518400.0, "grad_norm": 1.9427039767358767, "language_loss": 0.88888168, "learning_rate": 3.929965805687474e-06, "loss": 0.91120267, "num_input_tokens_seen": 40290660, "step": 1862, "time_per_iteration": 2.615057945251465 }, { "auxiliary_loss_clip": 0.01164251, "auxiliary_loss_mlp": 0.01062894, "balance_loss_clip": 1.05994737, "balance_loss_mlp": 1.04086459, "epoch": 0.11200961972042688, "flos": 25153880728320.0, "grad_norm": 2.2273555113866847, "language_loss": 0.87719512, "learning_rate": 3.92986360831752e-06, "loss": 0.89946657, "num_input_tokens_seen": 40307820, "step": 1863, "time_per_iteration": 2.6778175830841064 }, { "auxiliary_loss_clip": 0.01158667, "auxiliary_loss_mlp": 0.01055299, "balance_loss_clip": 1.05455208, "balance_loss_mlp": 1.03071773, "epoch": 0.11206974297309484, "flos": 21288312829440.0, "grad_norm": 2.8013407816012226, "language_loss": 0.64245486, "learning_rate": 3.929761337766945e-06, "loss": 0.66459453, "num_input_tokens_seen": 40327430, "step": 1864, "time_per_iteration": 2.724076509475708 }, { "auxiliary_loss_clip": 0.01110154, "auxiliary_loss_mlp": 0.01047933, "balance_loss_clip": 1.04924703, "balance_loss_mlp": 1.02672601, "epoch": 0.11212986622576282, "flos": 18915982151040.0, "grad_norm": 2.0303098144917135, "language_loss": 0.74043733, "learning_rate": 3.929658994039627e-06, "loss": 0.7620182, "num_input_tokens_seen": 40344545, "step": 1865, "time_per_iteration": 2.8119356632232666 }, { "auxiliary_loss_clip": 0.01114683, "auxiliary_loss_mlp": 0.01070203, "balance_loss_clip": 1.05348182, "balance_loss_mlp": 1.04483545, "epoch": 0.11218998947843078, "flos": 22054754257920.0, "grad_norm": 2.7389427033573375, "language_loss": 0.84692436, "learning_rate": 3.929556577139446e-06, "loss": 0.86877316, "num_input_tokens_seen": 40362300, "step": 1866, "time_per_iteration": 2.8022067546844482 }, { "auxiliary_loss_clip": 0.01092364, "auxiliary_loss_mlp": 0.00781014, "balance_loss_clip": 1.04227424, "balance_loss_mlp": 1.00006938, "epoch": 0.11225011273109875, "flos": 24571697091840.0, "grad_norm": 1.704208120094955, "language_loss": 0.8104012, "learning_rate": 3.929454087070286e-06, "loss": 0.82913494, "num_input_tokens_seen": 40384720, "step": 1867, "time_per_iteration": 2.915989875793457 }, { "auxiliary_loss_clip": 0.01179505, "auxiliary_loss_mlp": 0.01060529, "balance_loss_clip": 1.05720687, "balance_loss_mlp": 1.03959608, "epoch": 0.11231023598376672, "flos": 28438665621120.0, "grad_norm": 2.0811636681692844, "language_loss": 0.86840278, "learning_rate": 3.929351523836035e-06, "loss": 0.8908031, "num_input_tokens_seen": 40404000, "step": 1868, "time_per_iteration": 2.6855647563934326 }, { "auxiliary_loss_clip": 0.01161412, "auxiliary_loss_mlp": 0.00779977, "balance_loss_clip": 1.06005311, "balance_loss_mlp": 1.00010097, "epoch": 0.1123703592364347, "flos": 14426466076800.0, "grad_norm": 2.1491178409138376, "language_loss": 0.68308532, "learning_rate": 3.9292488874405795e-06, "loss": 0.70249927, "num_input_tokens_seen": 40418665, "step": 1869, "time_per_iteration": 2.7404487133026123 }, { "auxiliary_loss_clip": 0.01133783, "auxiliary_loss_mlp": 0.01066188, "balance_loss_clip": 1.04932964, "balance_loss_mlp": 1.04225063, "epoch": 0.11243048248910266, "flos": 22236282616320.0, "grad_norm": 1.5255545896853626, "language_loss": 0.76943326, "learning_rate": 3.929146177887814e-06, "loss": 0.79143298, "num_input_tokens_seen": 40437870, "step": 1870, "time_per_iteration": 2.809734344482422 }, { "auxiliary_loss_clip": 0.01129358, "auxiliary_loss_mlp": 0.01056867, "balance_loss_clip": 1.0509038, "balance_loss_mlp": 1.03300166, "epoch": 0.11249060574177062, "flos": 18584167288320.0, "grad_norm": 1.8186132867503446, "language_loss": 0.76056099, "learning_rate": 3.929043395181631e-06, "loss": 0.78242326, "num_input_tokens_seen": 40455570, "step": 1871, "time_per_iteration": 2.727161169052124 }, { "auxiliary_loss_clip": 0.01105662, "auxiliary_loss_mlp": 0.01051114, "balance_loss_clip": 1.04993379, "balance_loss_mlp": 1.03026426, "epoch": 0.1125507289944386, "flos": 22856567604480.0, "grad_norm": 1.9425066802508644, "language_loss": 0.81811988, "learning_rate": 3.928940539325929e-06, "loss": 0.83968765, "num_input_tokens_seen": 40473600, "step": 1872, "time_per_iteration": 2.851868152618408 }, { "auxiliary_loss_clip": 0.01179923, "auxiliary_loss_mlp": 0.01055722, "balance_loss_clip": 1.05722499, "balance_loss_mlp": 1.03359652, "epoch": 0.11261085224710657, "flos": 19676390094720.0, "grad_norm": 2.186176467187071, "language_loss": 0.8361913, "learning_rate": 3.9288376103246095e-06, "loss": 0.85854775, "num_input_tokens_seen": 40490025, "step": 1873, "time_per_iteration": 2.6668763160705566 }, { "auxiliary_loss_clip": 0.01144862, "auxiliary_loss_mlp": 0.01054726, "balance_loss_clip": 1.0525465, "balance_loss_mlp": 1.03196871, "epoch": 0.11267097549977453, "flos": 26063246373120.0, "grad_norm": 1.8822875514234196, "language_loss": 0.92342389, "learning_rate": 3.928734608181575e-06, "loss": 0.94541967, "num_input_tokens_seen": 40511580, "step": 1874, "time_per_iteration": 2.700533866882324 }, { "auxiliary_loss_clip": 0.01140327, "auxiliary_loss_mlp": 0.01056402, "balance_loss_clip": 1.05100179, "balance_loss_mlp": 1.03509891, "epoch": 0.11273109875244251, "flos": 21068036674560.0, "grad_norm": 1.6564425098873434, "language_loss": 0.75359404, "learning_rate": 3.928631532900729e-06, "loss": 0.77556133, "num_input_tokens_seen": 40530155, "step": 1875, "time_per_iteration": 2.7642719745635986 }, { "auxiliary_loss_clip": 0.01167091, "auxiliary_loss_mlp": 0.01055271, "balance_loss_clip": 1.05893159, "balance_loss_mlp": 1.0348264, "epoch": 0.11279122200511048, "flos": 27088999061760.0, "grad_norm": 2.12758140825061, "language_loss": 0.71578634, "learning_rate": 3.928528384485984e-06, "loss": 0.73800993, "num_input_tokens_seen": 40549500, "step": 1876, "time_per_iteration": 2.8505096435546875 }, { "auxiliary_loss_clip": 0.01147417, "auxiliary_loss_mlp": 0.01054094, "balance_loss_clip": 1.05223966, "balance_loss_mlp": 1.03200495, "epoch": 0.11285134525777844, "flos": 20187901722240.0, "grad_norm": 1.8103612630164048, "language_loss": 0.76795971, "learning_rate": 3.9284251629412475e-06, "loss": 0.78997481, "num_input_tokens_seen": 40567475, "step": 1877, "time_per_iteration": 2.6972849369049072 }, { "auxiliary_loss_clip": 0.01168106, "auxiliary_loss_mlp": 0.01063056, "balance_loss_clip": 1.05518627, "balance_loss_mlp": 1.04026341, "epoch": 0.11291146851044641, "flos": 12458453863680.0, "grad_norm": 2.1601834607000368, "language_loss": 0.87843502, "learning_rate": 3.928321868270436e-06, "loss": 0.90074658, "num_input_tokens_seen": 40583280, "step": 1878, "time_per_iteration": 5.6992692947387695 }, { "auxiliary_loss_clip": 0.01140682, "auxiliary_loss_mlp": 0.01054902, "balance_loss_clip": 1.05420399, "balance_loss_mlp": 1.03333724, "epoch": 0.11297159176311439, "flos": 23842315520640.0, "grad_norm": 2.151084139284284, "language_loss": 0.81623232, "learning_rate": 3.928218500477466e-06, "loss": 0.83818817, "num_input_tokens_seen": 40603080, "step": 1879, "time_per_iteration": 2.8688366413116455 }, { "auxiliary_loss_clip": 0.01155904, "auxiliary_loss_mlp": 0.01059079, "balance_loss_clip": 1.05238748, "balance_loss_mlp": 1.03609526, "epoch": 0.11303171501578235, "flos": 29930538124800.0, "grad_norm": 1.941623939252122, "language_loss": 0.70234305, "learning_rate": 3.928115059566259e-06, "loss": 0.72449279, "num_input_tokens_seen": 40623255, "step": 1880, "time_per_iteration": 5.567574739456177 }, { "auxiliary_loss_clip": 0.01155691, "auxiliary_loss_mlp": 0.01052309, "balance_loss_clip": 1.05585837, "balance_loss_mlp": 1.0306015, "epoch": 0.11309183826845032, "flos": 16180558842240.0, "grad_norm": 1.6696082535169858, "language_loss": 0.72690225, "learning_rate": 3.928011545540734e-06, "loss": 0.74898225, "num_input_tokens_seen": 40641570, "step": 1881, "time_per_iteration": 2.792428493499756 }, { "auxiliary_loss_clip": 0.011425, "auxiliary_loss_mlp": 0.00781179, "balance_loss_clip": 1.05046606, "balance_loss_mlp": 1.00008667, "epoch": 0.1131519615211183, "flos": 12020702814720.0, "grad_norm": 2.2964043184115783, "language_loss": 0.74205768, "learning_rate": 3.927907958404819e-06, "loss": 0.76129448, "num_input_tokens_seen": 40658775, "step": 1882, "time_per_iteration": 4.414916515350342 }, { "auxiliary_loss_clip": 0.01177281, "auxiliary_loss_mlp": 0.01054815, "balance_loss_clip": 1.05680335, "balance_loss_mlp": 1.03203452, "epoch": 0.11321208477378626, "flos": 26250125857920.0, "grad_norm": 2.4326158086005965, "language_loss": 0.7923016, "learning_rate": 3.92780429816244e-06, "loss": 0.81462252, "num_input_tokens_seen": 40679555, "step": 1883, "time_per_iteration": 2.762615919113159 }, { "auxiliary_loss_clip": 0.01140926, "auxiliary_loss_mlp": 0.01058465, "balance_loss_clip": 1.05226314, "balance_loss_mlp": 1.03520727, "epoch": 0.11327220802645423, "flos": 13626376583040.0, "grad_norm": 2.2898863699254974, "language_loss": 0.77047318, "learning_rate": 3.927700564817529e-06, "loss": 0.79246712, "num_input_tokens_seen": 40697295, "step": 1884, "time_per_iteration": 2.835468292236328 }, { "auxiliary_loss_clip": 0.01074478, "auxiliary_loss_mlp": 0.01009476, "balance_loss_clip": 1.03993821, "balance_loss_mlp": 1.00620937, "epoch": 0.1133323312791222, "flos": 57191802814080.0, "grad_norm": 0.8138652948403053, "language_loss": 0.55151373, "learning_rate": 3.927596758374019e-06, "loss": 0.5723533, "num_input_tokens_seen": 40758095, "step": 1885, "time_per_iteration": 3.179532289505005 }, { "auxiliary_loss_clip": 0.01083888, "auxiliary_loss_mlp": 0.01050751, "balance_loss_clip": 1.04415166, "balance_loss_mlp": 1.02910316, "epoch": 0.11339245453179017, "flos": 24351708245760.0, "grad_norm": 1.9836288003076585, "language_loss": 0.90384823, "learning_rate": 3.927492878835848e-06, "loss": 0.92519462, "num_input_tokens_seen": 40777140, "step": 1886, "time_per_iteration": 3.038928747177124 }, { "auxiliary_loss_clip": 0.01116325, "auxiliary_loss_mlp": 0.01057697, "balance_loss_clip": 1.05137897, "balance_loss_mlp": 1.03634632, "epoch": 0.11345257778445814, "flos": 22670693700480.0, "grad_norm": 2.0132756022974023, "language_loss": 0.84852886, "learning_rate": 3.927388926206953e-06, "loss": 0.87026906, "num_input_tokens_seen": 40797505, "step": 1887, "time_per_iteration": 3.178863048553467 }, { "auxiliary_loss_clip": 0.01136567, "auxiliary_loss_mlp": 0.01056557, "balance_loss_clip": 1.05091035, "balance_loss_mlp": 1.03549314, "epoch": 0.11351270103712612, "flos": 20988242611200.0, "grad_norm": 2.847610033990257, "language_loss": 0.75826252, "learning_rate": 3.927284900491277e-06, "loss": 0.78019381, "num_input_tokens_seen": 40812970, "step": 1888, "time_per_iteration": 2.7349846363067627 }, { "auxiliary_loss_clip": 0.0113463, "auxiliary_loss_mlp": 0.01062359, "balance_loss_clip": 1.05614805, "balance_loss_mlp": 1.03892243, "epoch": 0.11357282428979408, "flos": 37347923600640.0, "grad_norm": 2.0598279187313624, "language_loss": 0.68104899, "learning_rate": 3.927180801692764e-06, "loss": 0.7030189, "num_input_tokens_seen": 40837745, "step": 1889, "time_per_iteration": 3.144444465637207 }, { "auxiliary_loss_clip": 0.01177206, "auxiliary_loss_mlp": 0.01049162, "balance_loss_clip": 1.05653095, "balance_loss_mlp": 1.02694094, "epoch": 0.11363294754246205, "flos": 21757018423680.0, "grad_norm": 1.7896678692754837, "language_loss": 0.83947051, "learning_rate": 3.927076629815362e-06, "loss": 0.86173415, "num_input_tokens_seen": 40856490, "step": 1890, "time_per_iteration": 2.73126482963562 }, { "auxiliary_loss_clip": 0.01145149, "auxiliary_loss_mlp": 0.01056017, "balance_loss_clip": 1.05039728, "balance_loss_mlp": 1.03395164, "epoch": 0.11369307079513001, "flos": 22601637803520.0, "grad_norm": 2.1678723202845256, "language_loss": 0.64663875, "learning_rate": 3.926972384863022e-06, "loss": 0.66865045, "num_input_tokens_seen": 40874070, "step": 1891, "time_per_iteration": 2.7474160194396973 }, { "auxiliary_loss_clip": 0.01145505, "auxiliary_loss_mlp": 0.01049015, "balance_loss_clip": 1.05395687, "balance_loss_mlp": 1.02773631, "epoch": 0.11375319404779799, "flos": 21944257044480.0, "grad_norm": 2.126575023047711, "language_loss": 0.87889415, "learning_rate": 3.9268680668396956e-06, "loss": 0.90083933, "num_input_tokens_seen": 40892425, "step": 1892, "time_per_iteration": 2.795269250869751 }, { "auxiliary_loss_clip": 0.01119535, "auxiliary_loss_mlp": 0.01079586, "balance_loss_clip": 1.05541015, "balance_loss_mlp": 1.05461168, "epoch": 0.11381331730046595, "flos": 26395456285440.0, "grad_norm": 3.1806920305576973, "language_loss": 0.72902197, "learning_rate": 3.926763675749339e-06, "loss": 0.75101316, "num_input_tokens_seen": 40912190, "step": 1893, "time_per_iteration": 2.890289306640625 }, { "auxiliary_loss_clip": 0.01175698, "auxiliary_loss_mlp": 0.0106591, "balance_loss_clip": 1.05438137, "balance_loss_mlp": 1.04290223, "epoch": 0.11387344055313392, "flos": 23804716959360.0, "grad_norm": 1.8842571229841023, "language_loss": 0.79247093, "learning_rate": 3.92665921159591e-06, "loss": 0.81488699, "num_input_tokens_seen": 40928395, "step": 1894, "time_per_iteration": 2.6820743083953857 }, { "auxiliary_loss_clip": 0.01150233, "auxiliary_loss_mlp": 0.01061956, "balance_loss_clip": 1.05356526, "balance_loss_mlp": 1.03944933, "epoch": 0.1139335638058019, "flos": 34522865902080.0, "grad_norm": 3.429237983174195, "language_loss": 0.79718482, "learning_rate": 3.926554674383371e-06, "loss": 0.81930667, "num_input_tokens_seen": 40946555, "step": 1895, "time_per_iteration": 2.829946994781494 }, { "auxiliary_loss_clip": 0.01075529, "auxiliary_loss_mlp": 0.01018518, "balance_loss_clip": 1.03062391, "balance_loss_mlp": 1.0155375, "epoch": 0.11399368705846986, "flos": 70587811520640.0, "grad_norm": 0.8041110638842961, "language_loss": 0.63357508, "learning_rate": 3.926450064115686e-06, "loss": 0.65451556, "num_input_tokens_seen": 41004910, "step": 1896, "time_per_iteration": 3.3087315559387207 }, { "auxiliary_loss_clip": 0.01147265, "auxiliary_loss_mlp": 0.0106086, "balance_loss_clip": 1.05560398, "balance_loss_mlp": 1.03663635, "epoch": 0.11405381031113783, "flos": 21324259365120.0, "grad_norm": 1.5952307342327186, "language_loss": 0.85055745, "learning_rate": 3.926345380796821e-06, "loss": 0.8726387, "num_input_tokens_seen": 41026385, "step": 1897, "time_per_iteration": 2.8522274494171143 }, { "auxiliary_loss_clip": 0.0117836, "auxiliary_loss_mlp": 0.00780276, "balance_loss_clip": 1.05591989, "balance_loss_mlp": 1.0001986, "epoch": 0.11411393356380581, "flos": 19719627091200.0, "grad_norm": 3.3624139627125587, "language_loss": 0.79675245, "learning_rate": 3.9262406244307465e-06, "loss": 0.81633884, "num_input_tokens_seen": 41045315, "step": 1898, "time_per_iteration": 2.760057210922241 }, { "auxiliary_loss_clip": 0.01115338, "auxiliary_loss_mlp": 0.01064417, "balance_loss_clip": 1.04594529, "balance_loss_mlp": 1.03965724, "epoch": 0.11417405681647377, "flos": 17530440883200.0, "grad_norm": 2.0191769665152903, "language_loss": 0.73251313, "learning_rate": 3.926135795021435e-06, "loss": 0.75431061, "num_input_tokens_seen": 41063390, "step": 1899, "time_per_iteration": 2.7363204956054688 }, { "auxiliary_loss_clip": 0.01042449, "auxiliary_loss_mlp": 0.01003313, "balance_loss_clip": 1.03643703, "balance_loss_mlp": 1.0003922, "epoch": 0.11423418006914174, "flos": 59674666619520.0, "grad_norm": 0.9089505356695228, "language_loss": 0.63434029, "learning_rate": 3.92603089257286e-06, "loss": 0.65479791, "num_input_tokens_seen": 41124180, "step": 1900, "time_per_iteration": 3.2045955657958984 }, { "auxiliary_loss_clip": 0.01113626, "auxiliary_loss_mlp": 0.01066815, "balance_loss_clip": 1.04929233, "balance_loss_mlp": 1.04378414, "epoch": 0.1142943033218097, "flos": 22963114321920.0, "grad_norm": 1.577500478750639, "language_loss": 0.77943742, "learning_rate": 3.925925917089001e-06, "loss": 0.80124187, "num_input_tokens_seen": 41143485, "step": 1901, "time_per_iteration": 2.745089530944824 }, { "auxiliary_loss_clip": 0.01171621, "auxiliary_loss_mlp": 0.01057834, "balance_loss_clip": 1.05803061, "balance_loss_mlp": 1.0359118, "epoch": 0.11435442657447768, "flos": 18256267008000.0, "grad_norm": 2.175933638179557, "language_loss": 0.84158623, "learning_rate": 3.925820868573839e-06, "loss": 0.86388075, "num_input_tokens_seen": 41161695, "step": 1902, "time_per_iteration": 2.6433799266815186 }, { "auxiliary_loss_clip": 0.01159941, "auxiliary_loss_mlp": 0.01056662, "balance_loss_clip": 1.05280399, "balance_loss_mlp": 1.03122306, "epoch": 0.11441454982714565, "flos": 24061191045120.0, "grad_norm": 1.7702735053047673, "language_loss": 0.77720451, "learning_rate": 3.925715747031356e-06, "loss": 0.79937053, "num_input_tokens_seen": 41181715, "step": 1903, "time_per_iteration": 2.6385905742645264 }, { "auxiliary_loss_clip": 0.01145143, "auxiliary_loss_mlp": 0.0104196, "balance_loss_clip": 1.05293322, "balance_loss_mlp": 1.02174175, "epoch": 0.11447467307981361, "flos": 25337707557120.0, "grad_norm": 2.212790565732917, "language_loss": 0.75751555, "learning_rate": 3.925610552465539e-06, "loss": 0.77938658, "num_input_tokens_seen": 41201770, "step": 1904, "time_per_iteration": 2.632152557373047 }, { "auxiliary_loss_clip": 0.01149375, "auxiliary_loss_mlp": 0.01056532, "balance_loss_clip": 1.05207586, "balance_loss_mlp": 1.03279781, "epoch": 0.11453479633248159, "flos": 21726063878400.0, "grad_norm": 2.4422699353972006, "language_loss": 0.91853034, "learning_rate": 3.9255052848803764e-06, "loss": 0.94058943, "num_input_tokens_seen": 41220590, "step": 1905, "time_per_iteration": 2.7421486377716064 }, { "auxiliary_loss_clip": 0.01161686, "auxiliary_loss_mlp": 0.01050264, "balance_loss_clip": 1.04978943, "balance_loss_mlp": 1.02612448, "epoch": 0.11459491958514956, "flos": 12969714096000.0, "grad_norm": 2.5117992419356066, "language_loss": 0.77484202, "learning_rate": 3.925399944279861e-06, "loss": 0.79696143, "num_input_tokens_seen": 41237250, "step": 1906, "time_per_iteration": 2.69333553314209 }, { "auxiliary_loss_clip": 0.0117911, "auxiliary_loss_mlp": 0.01055129, "balance_loss_clip": 1.05697322, "balance_loss_mlp": 1.03222847, "epoch": 0.11465504283781752, "flos": 22711273090560.0, "grad_norm": 2.0720467666322113, "language_loss": 0.81739306, "learning_rate": 3.925294530667986e-06, "loss": 0.83973539, "num_input_tokens_seen": 41256680, "step": 1907, "time_per_iteration": 2.6531317234039307 }, { "auxiliary_loss_clip": 0.0113647, "auxiliary_loss_mlp": 0.01065473, "balance_loss_clip": 1.05235374, "balance_loss_mlp": 1.04227471, "epoch": 0.1147151660904855, "flos": 23398387332480.0, "grad_norm": 2.1769364553121293, "language_loss": 0.84901214, "learning_rate": 3.92518904404875e-06, "loss": 0.87103164, "num_input_tokens_seen": 41270955, "step": 1908, "time_per_iteration": 2.8768258094787598 }, { "auxiliary_loss_clip": 0.01029536, "auxiliary_loss_mlp": 0.01020856, "balance_loss_clip": 1.02524137, "balance_loss_mlp": 1.01694632, "epoch": 0.11477528934315347, "flos": 63011843498880.0, "grad_norm": 0.9197306473097341, "language_loss": 0.61072773, "learning_rate": 3.925083484426153e-06, "loss": 0.63123173, "num_input_tokens_seen": 41319180, "step": 1909, "time_per_iteration": 3.0845727920532227 }, { "auxiliary_loss_clip": 0.01182744, "auxiliary_loss_mlp": 0.01054075, "balance_loss_clip": 1.06014562, "balance_loss_mlp": 1.03219986, "epoch": 0.11483541259582143, "flos": 16325601960960.0, "grad_norm": 7.319166590530674, "language_loss": 0.79170966, "learning_rate": 3.924977851804197e-06, "loss": 0.81407785, "num_input_tokens_seen": 41337480, "step": 1910, "time_per_iteration": 2.708704710006714 }, { "auxiliary_loss_clip": 0.01156489, "auxiliary_loss_mlp": 0.01052406, "balance_loss_clip": 1.0580864, "balance_loss_mlp": 1.03029275, "epoch": 0.1148955358484894, "flos": 21580410228480.0, "grad_norm": 2.117911712245717, "language_loss": 0.7702589, "learning_rate": 3.9248721461868875e-06, "loss": 0.79234779, "num_input_tokens_seen": 41354650, "step": 1911, "time_per_iteration": 2.7597720623016357 }, { "auxiliary_loss_clip": 0.01159986, "auxiliary_loss_mlp": 0.01054599, "balance_loss_clip": 1.05726957, "balance_loss_mlp": 1.03227139, "epoch": 0.11495565910115738, "flos": 27673696650240.0, "grad_norm": 1.677508784227342, "language_loss": 0.79177421, "learning_rate": 3.9247663675782336e-06, "loss": 0.81392002, "num_input_tokens_seen": 41376935, "step": 1912, "time_per_iteration": 2.8143310546875 }, { "auxiliary_loss_clip": 0.01183047, "auxiliary_loss_mlp": 0.00779659, "balance_loss_clip": 1.06065917, "balance_loss_mlp": 1.00014925, "epoch": 0.11501578235382534, "flos": 20632368614400.0, "grad_norm": 2.291252405113977, "language_loss": 0.77942276, "learning_rate": 3.924660515982246e-06, "loss": 0.79904979, "num_input_tokens_seen": 41396105, "step": 1913, "time_per_iteration": 2.696430206298828 }, { "auxiliary_loss_clip": 0.01166892, "auxiliary_loss_mlp": 0.01052769, "balance_loss_clip": 1.05442226, "balance_loss_mlp": 1.02953506, "epoch": 0.1150759056064933, "flos": 19829046896640.0, "grad_norm": 1.8145547055361753, "language_loss": 0.7003395, "learning_rate": 3.924554591402939e-06, "loss": 0.72253609, "num_input_tokens_seen": 41415600, "step": 1914, "time_per_iteration": 2.739251136779785 }, { "auxiliary_loss_clip": 0.01007182, "auxiliary_loss_mlp": 0.01004682, "balance_loss_clip": 1.02677619, "balance_loss_mlp": 1.00191641, "epoch": 0.11513602885916129, "flos": 70045776311040.0, "grad_norm": 0.7558771871458172, "language_loss": 0.61059874, "learning_rate": 3.92444859384433e-06, "loss": 0.6307174, "num_input_tokens_seen": 41478760, "step": 1915, "time_per_iteration": 3.56019926071167 }, { "auxiliary_loss_clip": 0.01166434, "auxiliary_loss_mlp": 0.01058573, "balance_loss_clip": 1.05994964, "balance_loss_mlp": 1.03595936, "epoch": 0.11519615211182925, "flos": 15741730385280.0, "grad_norm": 2.437201506258279, "language_loss": 0.93116963, "learning_rate": 3.924342523310436e-06, "loss": 0.95341969, "num_input_tokens_seen": 41495720, "step": 1916, "time_per_iteration": 3.244772434234619 }, { "auxiliary_loss_clip": 0.01161132, "auxiliary_loss_mlp": 0.01059827, "balance_loss_clip": 1.05798697, "balance_loss_mlp": 1.03470993, "epoch": 0.11525627536449722, "flos": 20667632791680.0, "grad_norm": 1.8909260082350545, "language_loss": 0.72560197, "learning_rate": 3.9242363798052806e-06, "loss": 0.74781156, "num_input_tokens_seen": 41513585, "step": 1917, "time_per_iteration": 4.502236843109131 }, { "auxiliary_loss_clip": 0.01138773, "auxiliary_loss_mlp": 0.0104964, "balance_loss_clip": 1.05739903, "balance_loss_mlp": 1.02700245, "epoch": 0.1153163986171652, "flos": 20303283185280.0, "grad_norm": 9.147356795176979, "language_loss": 0.74213129, "learning_rate": 3.92413016333289e-06, "loss": 0.76401544, "num_input_tokens_seen": 41533390, "step": 1918, "time_per_iteration": 4.344711065292358 }, { "auxiliary_loss_clip": 0.0114898, "auxiliary_loss_mlp": 0.010469, "balance_loss_clip": 1.05532503, "balance_loss_mlp": 1.02450073, "epoch": 0.11537652186983316, "flos": 17639321984640.0, "grad_norm": 3.182152136597976, "language_loss": 0.86367452, "learning_rate": 3.92402387389729e-06, "loss": 0.88563335, "num_input_tokens_seen": 41551015, "step": 1919, "time_per_iteration": 4.540036201477051 }, { "auxiliary_loss_clip": 0.01134044, "auxiliary_loss_mlp": 0.01067867, "balance_loss_clip": 1.0496366, "balance_loss_mlp": 1.04172444, "epoch": 0.11543664512250112, "flos": 21069401391360.0, "grad_norm": 1.93595243799445, "language_loss": 0.86735415, "learning_rate": 3.923917511502512e-06, "loss": 0.8893733, "num_input_tokens_seen": 41568055, "step": 1920, "time_per_iteration": 2.7719242572784424 }, { "auxiliary_loss_clip": 0.011686, "auxiliary_loss_mlp": 0.010528, "balance_loss_clip": 1.0593946, "balance_loss_mlp": 1.0302341, "epoch": 0.11549676837516909, "flos": 22747542848640.0, "grad_norm": 4.512761907267092, "language_loss": 0.79294932, "learning_rate": 3.923811076152589e-06, "loss": 0.81516337, "num_input_tokens_seen": 41587435, "step": 1921, "time_per_iteration": 2.798673629760742 }, { "auxiliary_loss_clip": 0.01174604, "auxiliary_loss_mlp": 0.01063526, "balance_loss_clip": 1.05685806, "balance_loss_mlp": 1.04007721, "epoch": 0.11555689162783707, "flos": 19168972617600.0, "grad_norm": 2.4057040360661484, "language_loss": 0.78464305, "learning_rate": 3.923704567851557e-06, "loss": 0.80702436, "num_input_tokens_seen": 41604975, "step": 1922, "time_per_iteration": 4.352341651916504 }, { "auxiliary_loss_clip": 0.01092284, "auxiliary_loss_mlp": 0.01064602, "balance_loss_clip": 1.04645681, "balance_loss_mlp": 1.04229808, "epoch": 0.11561701488050503, "flos": 24572056227840.0, "grad_norm": 1.8560991769949675, "language_loss": 0.84293079, "learning_rate": 3.923597986603456e-06, "loss": 0.86449969, "num_input_tokens_seen": 41626155, "step": 1923, "time_per_iteration": 3.2956740856170654 }, { "auxiliary_loss_clip": 0.01171957, "auxiliary_loss_mlp": 0.01056739, "balance_loss_clip": 1.0600003, "balance_loss_mlp": 1.03317094, "epoch": 0.115677138133173, "flos": 17092546179840.0, "grad_norm": 1.944851076041885, "language_loss": 0.80890471, "learning_rate": 3.9234913324123264e-06, "loss": 0.83119166, "num_input_tokens_seen": 41644805, "step": 1924, "time_per_iteration": 3.0939247608184814 }, { "auxiliary_loss_clip": 0.01055916, "auxiliary_loss_mlp": 0.01027131, "balance_loss_clip": 1.03045607, "balance_loss_mlp": 1.02436543, "epoch": 0.11573726138584098, "flos": 62703875266560.0, "grad_norm": 0.8171642061509322, "language_loss": 0.61196578, "learning_rate": 3.923384605282212e-06, "loss": 0.63279623, "num_input_tokens_seen": 41709345, "step": 1925, "time_per_iteration": 3.3765265941619873 }, { "auxiliary_loss_clip": 0.01155845, "auxiliary_loss_mlp": 0.01079328, "balance_loss_clip": 1.05374098, "balance_loss_mlp": 1.0549382, "epoch": 0.11579738463850894, "flos": 22601135013120.0, "grad_norm": 1.7772533553430212, "language_loss": 0.74766397, "learning_rate": 3.923277805217161e-06, "loss": 0.77001572, "num_input_tokens_seen": 41730210, "step": 1926, "time_per_iteration": 2.754974126815796 }, { "auxiliary_loss_clip": 0.01116228, "auxiliary_loss_mlp": 0.00781701, "balance_loss_clip": 1.04683304, "balance_loss_mlp": 1.00016665, "epoch": 0.11585750789117691, "flos": 21726135705600.0, "grad_norm": 4.731879086182685, "language_loss": 0.71978599, "learning_rate": 3.923170932221222e-06, "loss": 0.7387653, "num_input_tokens_seen": 41750270, "step": 1927, "time_per_iteration": 2.9454004764556885 }, { "auxiliary_loss_clip": 0.01137955, "auxiliary_loss_mlp": 0.01058796, "balance_loss_clip": 1.05250621, "balance_loss_mlp": 1.03572917, "epoch": 0.11591763114384489, "flos": 26287544851200.0, "grad_norm": 1.5938674022456252, "language_loss": 0.86854041, "learning_rate": 3.92306398629845e-06, "loss": 0.89050794, "num_input_tokens_seen": 41772975, "step": 1928, "time_per_iteration": 2.832750082015991 }, { "auxiliary_loss_clip": 0.01129041, "auxiliary_loss_mlp": 0.01060836, "balance_loss_clip": 1.05032003, "balance_loss_mlp": 1.03706551, "epoch": 0.11597775439651285, "flos": 23000461488000.0, "grad_norm": 1.6639520350020578, "language_loss": 0.77450585, "learning_rate": 3.922956967452898e-06, "loss": 0.79640466, "num_input_tokens_seen": 41791765, "step": 1929, "time_per_iteration": 2.7876811027526855 }, { "auxiliary_loss_clip": 0.01176887, "auxiliary_loss_mlp": 0.01063611, "balance_loss_clip": 1.05667901, "balance_loss_mlp": 1.0424509, "epoch": 0.11603787764918082, "flos": 31941715507200.0, "grad_norm": 1.8085677541874856, "language_loss": 0.76831949, "learning_rate": 3.922849875688626e-06, "loss": 0.79072452, "num_input_tokens_seen": 41815615, "step": 1930, "time_per_iteration": 2.819934844970703 }, { "auxiliary_loss_clip": 0.01145781, "auxiliary_loss_mlp": 0.01054046, "balance_loss_clip": 1.05066586, "balance_loss_mlp": 1.03165817, "epoch": 0.1160980009018488, "flos": 22271654534400.0, "grad_norm": 1.9434791543130712, "language_loss": 0.72291863, "learning_rate": 3.922742711009693e-06, "loss": 0.74491692, "num_input_tokens_seen": 41834810, "step": 1931, "time_per_iteration": 2.8078088760375977 }, { "auxiliary_loss_clip": 0.01146409, "auxiliary_loss_mlp": 0.01061336, "balance_loss_clip": 1.05090261, "balance_loss_mlp": 1.03575325, "epoch": 0.11615812415451676, "flos": 22783633038720.0, "grad_norm": 1.7378937044391531, "language_loss": 0.8222791, "learning_rate": 3.922635473420164e-06, "loss": 0.8443566, "num_input_tokens_seen": 41854975, "step": 1932, "time_per_iteration": 2.7495200634002686 }, { "auxiliary_loss_clip": 0.01030493, "auxiliary_loss_mlp": 0.01018834, "balance_loss_clip": 1.02184403, "balance_loss_mlp": 1.01556778, "epoch": 0.11621824740718473, "flos": 67146096107520.0, "grad_norm": 0.7669378012870447, "language_loss": 0.61050332, "learning_rate": 3.922528162924105e-06, "loss": 0.63099658, "num_input_tokens_seen": 41911105, "step": 1933, "time_per_iteration": 3.256678581237793 }, { "auxiliary_loss_clip": 0.01108577, "auxiliary_loss_mlp": 0.00780156, "balance_loss_clip": 1.04764509, "balance_loss_mlp": 1.00006175, "epoch": 0.11627837065985269, "flos": 20375930442240.0, "grad_norm": 2.830760437639296, "language_loss": 0.85790741, "learning_rate": 3.922420779525586e-06, "loss": 0.8767947, "num_input_tokens_seen": 41931750, "step": 1934, "time_per_iteration": 2.9144253730773926 }, { "auxiliary_loss_clip": 0.01117671, "auxiliary_loss_mlp": 0.01059839, "balance_loss_clip": 1.04929256, "balance_loss_mlp": 1.03453088, "epoch": 0.11633849391252067, "flos": 21725812483200.0, "grad_norm": 2.625764216143105, "language_loss": 0.66222906, "learning_rate": 3.9223133232286776e-06, "loss": 0.68400419, "num_input_tokens_seen": 41949400, "step": 1935, "time_per_iteration": 2.867152452468872 }, { "auxiliary_loss_clip": 0.01183991, "auxiliary_loss_mlp": 0.01052492, "balance_loss_clip": 1.05868936, "balance_loss_mlp": 1.03111792, "epoch": 0.11639861716518864, "flos": 18805341283200.0, "grad_norm": 2.025938843377603, "language_loss": 0.75678742, "learning_rate": 3.922205794037456e-06, "loss": 0.77915227, "num_input_tokens_seen": 41968100, "step": 1936, "time_per_iteration": 2.7282185554504395 }, { "auxiliary_loss_clip": 0.01179718, "auxiliary_loss_mlp": 0.01049532, "balance_loss_clip": 1.05632091, "balance_loss_mlp": 1.02639306, "epoch": 0.1164587404178566, "flos": 21214983214080.0, "grad_norm": 2.0032002399718905, "language_loss": 0.84086847, "learning_rate": 3.922098191955998e-06, "loss": 0.86316097, "num_input_tokens_seen": 41986375, "step": 1937, "time_per_iteration": 2.715386152267456 }, { "auxiliary_loss_clip": 0.01152084, "auxiliary_loss_mlp": 0.01048961, "balance_loss_clip": 1.05258632, "balance_loss_mlp": 1.0268234, "epoch": 0.11651886367052458, "flos": 27818632028160.0, "grad_norm": 3.0485930101216607, "language_loss": 0.7617709, "learning_rate": 3.921990516988384e-06, "loss": 0.78378135, "num_input_tokens_seen": 42006055, "step": 1938, "time_per_iteration": 2.7624804973602295 }, { "auxiliary_loss_clip": 0.01182576, "auxiliary_loss_mlp": 0.01055104, "balance_loss_clip": 1.05742419, "balance_loss_mlp": 1.03250146, "epoch": 0.11657898692319255, "flos": 22889569224960.0, "grad_norm": 1.7682499083089231, "language_loss": 0.79677606, "learning_rate": 3.921882769138696e-06, "loss": 0.81915289, "num_input_tokens_seen": 42024995, "step": 1939, "time_per_iteration": 2.71458101272583 }, { "auxiliary_loss_clip": 0.01148291, "auxiliary_loss_mlp": 0.01057951, "balance_loss_clip": 1.05209351, "balance_loss_mlp": 1.03508627, "epoch": 0.11663911017586051, "flos": 24315905364480.0, "grad_norm": 2.2281245193552475, "language_loss": 0.85916591, "learning_rate": 3.9217749484110215e-06, "loss": 0.88122833, "num_input_tokens_seen": 42042640, "step": 1940, "time_per_iteration": 2.7322728633880615 }, { "auxiliary_loss_clip": 0.01153746, "auxiliary_loss_mlp": 0.01056301, "balance_loss_clip": 1.05659437, "balance_loss_mlp": 1.03548717, "epoch": 0.11669923342852849, "flos": 42340152470400.0, "grad_norm": 1.4952807995381137, "language_loss": 0.75590646, "learning_rate": 3.921667054809449e-06, "loss": 0.77800703, "num_input_tokens_seen": 42067005, "step": 1941, "time_per_iteration": 2.9211390018463135 }, { "auxiliary_loss_clip": 0.01149585, "auxiliary_loss_mlp": 0.00780203, "balance_loss_clip": 1.05181897, "balance_loss_mlp": 1.00006557, "epoch": 0.11675935668119646, "flos": 14642288945280.0, "grad_norm": 2.277225749463833, "language_loss": 0.88847101, "learning_rate": 3.921559088338068e-06, "loss": 0.90776885, "num_input_tokens_seen": 42082295, "step": 1942, "time_per_iteration": 2.7145469188690186 }, { "auxiliary_loss_clip": 0.01165183, "auxiliary_loss_mlp": 0.01056257, "balance_loss_clip": 1.05553317, "balance_loss_mlp": 1.03552663, "epoch": 0.11681947993386442, "flos": 35116470063360.0, "grad_norm": 1.6547450593003057, "language_loss": 0.67979252, "learning_rate": 3.921451049000975e-06, "loss": 0.70200694, "num_input_tokens_seen": 42105295, "step": 1943, "time_per_iteration": 2.789701461791992 }, { "auxiliary_loss_clip": 0.01153022, "auxiliary_loss_mlp": 0.01047648, "balance_loss_clip": 1.05515063, "balance_loss_mlp": 1.02591634, "epoch": 0.11687960318653239, "flos": 38983259024640.0, "grad_norm": 1.9817763000300312, "language_loss": 0.69831288, "learning_rate": 3.921342936802265e-06, "loss": 0.72031963, "num_input_tokens_seen": 42125520, "step": 1944, "time_per_iteration": 2.827150583267212 }, { "auxiliary_loss_clip": 0.01155915, "auxiliary_loss_mlp": 0.01051888, "balance_loss_clip": 1.05038309, "balance_loss_mlp": 1.03158641, "epoch": 0.11693972643920036, "flos": 25994980575360.0, "grad_norm": 1.4963028532298175, "language_loss": 0.82662582, "learning_rate": 3.921234751746038e-06, "loss": 0.84870374, "num_input_tokens_seen": 42146335, "step": 1945, "time_per_iteration": 2.7190194129943848 }, { "auxiliary_loss_clip": 0.01137101, "auxiliary_loss_mlp": 0.01062082, "balance_loss_clip": 1.04682803, "balance_loss_mlp": 1.04005265, "epoch": 0.11699984969186833, "flos": 27272107618560.0, "grad_norm": 2.3643045784637735, "language_loss": 0.76298034, "learning_rate": 3.9211264938363975e-06, "loss": 0.78497219, "num_input_tokens_seen": 42165320, "step": 1946, "time_per_iteration": 2.792555093765259 }, { "auxiliary_loss_clip": 0.01134728, "auxiliary_loss_mlp": 0.01056112, "balance_loss_clip": 1.0507704, "balance_loss_mlp": 1.03536999, "epoch": 0.1170599729445363, "flos": 15267853232640.0, "grad_norm": 2.058923240355934, "language_loss": 0.69014907, "learning_rate": 3.921018163077448e-06, "loss": 0.71205747, "num_input_tokens_seen": 42182955, "step": 1947, "time_per_iteration": 2.643807888031006 }, { "auxiliary_loss_clip": 0.01154759, "auxiliary_loss_mlp": 0.01067767, "balance_loss_clip": 1.05707347, "balance_loss_mlp": 1.04604673, "epoch": 0.11712009619720427, "flos": 17164439251200.0, "grad_norm": 2.0690991629011615, "language_loss": 0.85044622, "learning_rate": 3.920909759473295e-06, "loss": 0.87267148, "num_input_tokens_seen": 42200760, "step": 1948, "time_per_iteration": 2.6399292945861816 }, { "auxiliary_loss_clip": 0.01051031, "auxiliary_loss_mlp": 0.0075782, "balance_loss_clip": 1.0245688, "balance_loss_mlp": 0.99997467, "epoch": 0.11718021944987224, "flos": 70940991997440.0, "grad_norm": 0.8206821069070506, "language_loss": 0.65139282, "learning_rate": 3.920801283028054e-06, "loss": 0.66948134, "num_input_tokens_seen": 42265745, "step": 1949, "time_per_iteration": 3.3030900955200195 }, { "auxiliary_loss_clip": 0.01159399, "auxiliary_loss_mlp": 0.01061163, "balance_loss_clip": 1.05735683, "balance_loss_mlp": 1.04054022, "epoch": 0.1172403427025402, "flos": 27453456408960.0, "grad_norm": 1.512876015443777, "language_loss": 0.71746683, "learning_rate": 3.920692733745835e-06, "loss": 0.73967248, "num_input_tokens_seen": 42286245, "step": 1950, "time_per_iteration": 2.739341974258423 }, { "auxiliary_loss_clip": 0.01175731, "auxiliary_loss_mlp": 0.01061149, "balance_loss_clip": 1.06152189, "balance_loss_mlp": 1.03907192, "epoch": 0.11730046595520818, "flos": 15668723992320.0, "grad_norm": 2.1258853115079996, "language_loss": 0.76671386, "learning_rate": 3.920584111630755e-06, "loss": 0.78908259, "num_input_tokens_seen": 42302710, "step": 1951, "time_per_iteration": 2.624788999557495 }, { "auxiliary_loss_clip": 0.01129104, "auxiliary_loss_mlp": 0.0106562, "balance_loss_clip": 1.05285251, "balance_loss_mlp": 1.04435349, "epoch": 0.11736058920787615, "flos": 25630164092160.0, "grad_norm": 1.7264952730121887, "language_loss": 0.75964963, "learning_rate": 3.9204754166869325e-06, "loss": 0.7815969, "num_input_tokens_seen": 42324115, "step": 1952, "time_per_iteration": 2.824826955795288 }, { "auxiliary_loss_clip": 0.01123677, "auxiliary_loss_mlp": 0.01065929, "balance_loss_clip": 1.04589534, "balance_loss_mlp": 1.04451907, "epoch": 0.11742071246054411, "flos": 21434289701760.0, "grad_norm": 2.2111022500713453, "language_loss": 0.72316217, "learning_rate": 3.920366648918491e-06, "loss": 0.74505818, "num_input_tokens_seen": 42342505, "step": 1953, "time_per_iteration": 2.7456531524658203 }, { "auxiliary_loss_clip": 0.01149214, "auxiliary_loss_mlp": 0.00781136, "balance_loss_clip": 1.0549686, "balance_loss_mlp": 1.0000577, "epoch": 0.11748083571321208, "flos": 15997845335040.0, "grad_norm": 2.1208802652878522, "language_loss": 0.79780388, "learning_rate": 3.920257808329552e-06, "loss": 0.81710744, "num_input_tokens_seen": 42360525, "step": 1954, "time_per_iteration": 2.653949737548828 }, { "auxiliary_loss_clip": 0.01112399, "auxiliary_loss_mlp": 0.01059787, "balance_loss_clip": 1.04880822, "balance_loss_mlp": 1.03763783, "epoch": 0.11754095896588006, "flos": 16180056051840.0, "grad_norm": 1.9673692595826442, "language_loss": 0.8553021, "learning_rate": 3.920148894924246e-06, "loss": 0.87702394, "num_input_tokens_seen": 42377045, "step": 1955, "time_per_iteration": 2.7987124919891357 }, { "auxiliary_loss_clip": 0.01163172, "auxiliary_loss_mlp": 0.00779783, "balance_loss_clip": 1.05209899, "balance_loss_mlp": 1.00016606, "epoch": 0.11760108221854802, "flos": 13261596013440.0, "grad_norm": 2.12926288831445, "language_loss": 0.78105426, "learning_rate": 3.920039908706701e-06, "loss": 0.80048382, "num_input_tokens_seen": 42393960, "step": 1956, "time_per_iteration": 2.6247944831848145 }, { "auxiliary_loss_clip": 0.01158287, "auxiliary_loss_mlp": 0.01058454, "balance_loss_clip": 1.05559933, "balance_loss_mlp": 1.03601909, "epoch": 0.11766120547121599, "flos": 24498439303680.0, "grad_norm": 2.264983200322237, "language_loss": 0.80487299, "learning_rate": 3.91993084968105e-06, "loss": 0.82704043, "num_input_tokens_seen": 42413160, "step": 1957, "time_per_iteration": 5.862411260604858 }, { "auxiliary_loss_clip": 0.01168294, "auxiliary_loss_mlp": 0.0105259, "balance_loss_clip": 1.05703866, "balance_loss_mlp": 1.0308696, "epoch": 0.11772132872388397, "flos": 17784005967360.0, "grad_norm": 4.8672025609093215, "language_loss": 0.77955222, "learning_rate": 3.919821717851428e-06, "loss": 0.80176103, "num_input_tokens_seen": 42432590, "step": 1958, "time_per_iteration": 4.4218549728393555 }, { "auxiliary_loss_clip": 0.01149976, "auxiliary_loss_mlp": 0.0105003, "balance_loss_clip": 1.05451894, "balance_loss_mlp": 1.02680755, "epoch": 0.11778145197655193, "flos": 13217030213760.0, "grad_norm": 1.7537692363765556, "language_loss": 0.77002251, "learning_rate": 3.919712513221976e-06, "loss": 0.79202259, "num_input_tokens_seen": 42450135, "step": 1959, "time_per_iteration": 2.674323558807373 }, { "auxiliary_loss_clip": 0.01162585, "auxiliary_loss_mlp": 0.01057019, "balance_loss_clip": 1.05857027, "balance_loss_mlp": 1.03484631, "epoch": 0.1178415752292199, "flos": 20230204965120.0, "grad_norm": 2.2026367524708927, "language_loss": 0.70078689, "learning_rate": 3.919603235796832e-06, "loss": 0.722983, "num_input_tokens_seen": 42470050, "step": 1960, "time_per_iteration": 2.7704508304595947 }, { "auxiliary_loss_clip": 0.01161089, "auxiliary_loss_mlp": 0.01055224, "balance_loss_clip": 1.05841374, "balance_loss_mlp": 1.03228831, "epoch": 0.11790169848188788, "flos": 13040134709760.0, "grad_norm": 2.663996374773888, "language_loss": 0.81045067, "learning_rate": 3.9194938855801406e-06, "loss": 0.83261371, "num_input_tokens_seen": 42484335, "step": 1961, "time_per_iteration": 4.67006778717041 }, { "auxiliary_loss_clip": 0.01163817, "auxiliary_loss_mlp": 0.00779643, "balance_loss_clip": 1.05658793, "balance_loss_mlp": 1.00009537, "epoch": 0.11796182173455584, "flos": 22265728790400.0, "grad_norm": 1.71345119244153, "language_loss": 0.92273545, "learning_rate": 3.919384462576049e-06, "loss": 0.94217002, "num_input_tokens_seen": 42502720, "step": 1962, "time_per_iteration": 2.6559524536132812 }, { "auxiliary_loss_clip": 0.01139826, "auxiliary_loss_mlp": 0.01058964, "balance_loss_clip": 1.05222392, "balance_loss_mlp": 1.03704107, "epoch": 0.1180219449872238, "flos": 10635017892480.0, "grad_norm": 2.157203116008796, "language_loss": 0.87635934, "learning_rate": 3.919274966788707e-06, "loss": 0.8983472, "num_input_tokens_seen": 42519460, "step": 1963, "time_per_iteration": 2.710042715072632 }, { "auxiliary_loss_clip": 0.0115823, "auxiliary_loss_mlp": 0.00779391, "balance_loss_clip": 1.05600929, "balance_loss_mlp": 1.00011134, "epoch": 0.11808206823989177, "flos": 20923532259840.0, "grad_norm": 2.8331529324994333, "language_loss": 0.83879703, "learning_rate": 3.919165398222265e-06, "loss": 0.85817325, "num_input_tokens_seen": 42539420, "step": 1964, "time_per_iteration": 2.734941244125366 }, { "auxiliary_loss_clip": 0.01122529, "auxiliary_loss_mlp": 0.01069054, "balance_loss_clip": 1.05171156, "balance_loss_mlp": 1.04628491, "epoch": 0.11814219149255975, "flos": 20777770869120.0, "grad_norm": 3.9132941826799543, "language_loss": 0.8313272, "learning_rate": 3.919055756880879e-06, "loss": 0.85324299, "num_input_tokens_seen": 42558225, "step": 1965, "time_per_iteration": 2.7427306175231934 }, { "auxiliary_loss_clip": 0.01178673, "auxiliary_loss_mlp": 0.01053338, "balance_loss_clip": 1.05815279, "balance_loss_mlp": 1.03163004, "epoch": 0.11820231474522772, "flos": 48759938542080.0, "grad_norm": 1.6720023918141877, "language_loss": 0.74227381, "learning_rate": 3.918946042768707e-06, "loss": 0.76459396, "num_input_tokens_seen": 42580790, "step": 1966, "time_per_iteration": 2.8265397548675537 }, { "auxiliary_loss_clip": 0.01163407, "auxiliary_loss_mlp": 0.0106081, "balance_loss_clip": 1.06309748, "balance_loss_mlp": 1.03836274, "epoch": 0.11826243799789568, "flos": 16690598012160.0, "grad_norm": 2.5628488285375397, "language_loss": 0.73137337, "learning_rate": 3.918836255889908e-06, "loss": 0.7536155, "num_input_tokens_seen": 42597355, "step": 1967, "time_per_iteration": 2.706193685531616 }, { "auxiliary_loss_clip": 0.01167052, "auxiliary_loss_mlp": 0.01053471, "balance_loss_clip": 1.05852592, "balance_loss_mlp": 1.03141701, "epoch": 0.11832256125056366, "flos": 16909868586240.0, "grad_norm": 5.332816815546028, "language_loss": 0.8831054, "learning_rate": 3.9187263962486456e-06, "loss": 0.90531063, "num_input_tokens_seen": 42616060, "step": 1968, "time_per_iteration": 2.6308343410491943 }, { "auxiliary_loss_clip": 0.01168356, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.06406927, "balance_loss_mlp": 1.0294776, "epoch": 0.11838268450323162, "flos": 22820405587200.0, "grad_norm": 2.252087054693662, "language_loss": 0.67010254, "learning_rate": 3.918616463849087e-06, "loss": 0.69230425, "num_input_tokens_seen": 42636285, "step": 1969, "time_per_iteration": 2.662480592727661 }, { "auxiliary_loss_clip": 0.01130071, "auxiliary_loss_mlp": 0.0106143, "balance_loss_clip": 1.05177045, "balance_loss_mlp": 1.03774357, "epoch": 0.11844280775589959, "flos": 33545844990720.0, "grad_norm": 2.153814675458072, "language_loss": 0.80455101, "learning_rate": 3.918506458695399e-06, "loss": 0.82646602, "num_input_tokens_seen": 42658320, "step": 1970, "time_per_iteration": 2.798050880432129 }, { "auxiliary_loss_clip": 0.01060284, "auxiliary_loss_mlp": 0.01021383, "balance_loss_clip": 1.02553701, "balance_loss_mlp": 1.01892686, "epoch": 0.11850293100856757, "flos": 66350998604160.0, "grad_norm": 0.8165911228106061, "language_loss": 0.66192186, "learning_rate": 3.918396380791754e-06, "loss": 0.68273854, "num_input_tokens_seen": 42721500, "step": 1971, "time_per_iteration": 3.167018413543701 }, { "auxiliary_loss_clip": 0.01151504, "auxiliary_loss_mlp": 0.0105629, "balance_loss_clip": 1.05294323, "balance_loss_mlp": 1.03422379, "epoch": 0.11856305426123553, "flos": 24681045070080.0, "grad_norm": 2.1839859106137554, "language_loss": 0.79782552, "learning_rate": 3.918286230142327e-06, "loss": 0.81990343, "num_input_tokens_seen": 42739825, "step": 1972, "time_per_iteration": 2.6908793449401855 }, { "auxiliary_loss_clip": 0.01133219, "auxiliary_loss_mlp": 0.00778766, "balance_loss_clip": 1.05341005, "balance_loss_mlp": 1.00005877, "epoch": 0.1186231775139035, "flos": 24280102483200.0, "grad_norm": 2.0473813607633384, "language_loss": 0.72843599, "learning_rate": 3.918176006751292e-06, "loss": 0.74755585, "num_input_tokens_seen": 42758695, "step": 1973, "time_per_iteration": 2.7801859378814697 }, { "auxiliary_loss_clip": 0.01138022, "auxiliary_loss_mlp": 0.01049764, "balance_loss_clip": 1.05580497, "balance_loss_mlp": 1.02707887, "epoch": 0.11868330076657148, "flos": 21757413473280.0, "grad_norm": 1.6449677647733996, "language_loss": 0.72019619, "learning_rate": 3.918065710622832e-06, "loss": 0.74207413, "num_input_tokens_seen": 42778510, "step": 1974, "time_per_iteration": 2.7337663173675537 }, { "auxiliary_loss_clip": 0.01129602, "auxiliary_loss_mlp": 0.01043161, "balance_loss_clip": 1.05265522, "balance_loss_mlp": 1.02086854, "epoch": 0.11874342401923944, "flos": 17193274894080.0, "grad_norm": 2.017372400194955, "language_loss": 0.77409399, "learning_rate": 3.917955341761128e-06, "loss": 0.79582161, "num_input_tokens_seen": 42793995, "step": 1975, "time_per_iteration": 2.669546604156494 }, { "auxiliary_loss_clip": 0.01131477, "auxiliary_loss_mlp": 0.01059968, "balance_loss_clip": 1.05880177, "balance_loss_mlp": 1.03908277, "epoch": 0.11880354727190741, "flos": 15229572312960.0, "grad_norm": 2.3842578575289, "language_loss": 0.75110453, "learning_rate": 3.917844900170364e-06, "loss": 0.77301902, "num_input_tokens_seen": 42809000, "step": 1976, "time_per_iteration": 2.8439090251922607 }, { "auxiliary_loss_clip": 0.0116819, "auxiliary_loss_mlp": 0.01049523, "balance_loss_clip": 1.05999744, "balance_loss_mlp": 1.02835166, "epoch": 0.11886367052457537, "flos": 27309706179840.0, "grad_norm": 1.8674311015318124, "language_loss": 0.74877423, "learning_rate": 3.91773438585473e-06, "loss": 0.77095133, "num_input_tokens_seen": 42831585, "step": 1977, "time_per_iteration": 2.6747169494628906 }, { "auxiliary_loss_clip": 0.01182095, "auxiliary_loss_mlp": 0.01059621, "balance_loss_clip": 1.05954552, "balance_loss_mlp": 1.03805614, "epoch": 0.11892379377724335, "flos": 21798280172160.0, "grad_norm": 2.1793079873879604, "language_loss": 0.74207634, "learning_rate": 3.9176237988184165e-06, "loss": 0.76449353, "num_input_tokens_seen": 42848420, "step": 1978, "time_per_iteration": 2.631664514541626 }, { "auxiliary_loss_clip": 0.01142323, "auxiliary_loss_mlp": 0.01050585, "balance_loss_clip": 1.06037045, "balance_loss_mlp": 1.0289247, "epoch": 0.11898391702991132, "flos": 13991013498240.0, "grad_norm": 1.7170872786869797, "language_loss": 0.73256385, "learning_rate": 3.917513139065616e-06, "loss": 0.754493, "num_input_tokens_seen": 42866645, "step": 1979, "time_per_iteration": 2.7442541122436523 }, { "auxiliary_loss_clip": 0.01137516, "auxiliary_loss_mlp": 0.01051378, "balance_loss_clip": 1.0566175, "balance_loss_mlp": 1.02968168, "epoch": 0.11904404028257928, "flos": 32234567091840.0, "grad_norm": 1.876224505386343, "language_loss": 0.98293436, "learning_rate": 3.917402406600525e-06, "loss": 1.00482333, "num_input_tokens_seen": 42888515, "step": 1980, "time_per_iteration": 2.787667989730835 }, { "auxiliary_loss_clip": 0.01153629, "auxiliary_loss_mlp": 0.01053612, "balance_loss_clip": 1.05595791, "balance_loss_mlp": 1.03077161, "epoch": 0.11910416353524726, "flos": 23586272398080.0, "grad_norm": 1.7507584506289393, "language_loss": 0.86265099, "learning_rate": 3.917291601427342e-06, "loss": 0.88472342, "num_input_tokens_seen": 42909035, "step": 1981, "time_per_iteration": 2.6680359840393066 }, { "auxiliary_loss_clip": 0.01158736, "auxiliary_loss_mlp": 0.01064978, "balance_loss_clip": 1.06144083, "balance_loss_mlp": 1.04214907, "epoch": 0.11916428678791523, "flos": 25333038789120.0, "grad_norm": 1.8908045276276995, "language_loss": 0.85375237, "learning_rate": 3.91718072355027e-06, "loss": 0.87598956, "num_input_tokens_seen": 42927555, "step": 1982, "time_per_iteration": 2.732797861099243 }, { "auxiliary_loss_clip": 0.01146432, "auxiliary_loss_mlp": 0.01050259, "balance_loss_clip": 1.05539966, "balance_loss_mlp": 1.02843213, "epoch": 0.11922441004058319, "flos": 19788431592960.0, "grad_norm": 2.3856086229742877, "language_loss": 0.85202634, "learning_rate": 3.917069772973513e-06, "loss": 0.87399322, "num_input_tokens_seen": 42945300, "step": 1983, "time_per_iteration": 2.6839804649353027 }, { "auxiliary_loss_clip": 0.01126589, "auxiliary_loss_mlp": 0.01056051, "balance_loss_clip": 1.05602145, "balance_loss_mlp": 1.03399742, "epoch": 0.11928453329325117, "flos": 21536347219200.0, "grad_norm": 3.6641824085676022, "language_loss": 0.7693429, "learning_rate": 3.916958749701277e-06, "loss": 0.79116929, "num_input_tokens_seen": 42961295, "step": 1984, "time_per_iteration": 2.7008767127990723 }, { "auxiliary_loss_clip": 0.01161623, "auxiliary_loss_mlp": 0.01055251, "balance_loss_clip": 1.05752373, "balance_loss_mlp": 1.0334003, "epoch": 0.11934465654591914, "flos": 20815010294400.0, "grad_norm": 1.917528093726237, "language_loss": 0.83058321, "learning_rate": 3.9168476537377745e-06, "loss": 0.85275191, "num_input_tokens_seen": 42980330, "step": 1985, "time_per_iteration": 2.6692728996276855 }, { "auxiliary_loss_clip": 0.01151831, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.0541923, "balance_loss_mlp": 1.02835393, "epoch": 0.1194047797985871, "flos": 19060486565760.0, "grad_norm": 1.8732848573733223, "language_loss": 0.74398553, "learning_rate": 3.916736485087216e-06, "loss": 0.76600474, "num_input_tokens_seen": 42996125, "step": 1986, "time_per_iteration": 2.722013473510742 }, { "auxiliary_loss_clip": 0.01146125, "auxiliary_loss_mlp": 0.01059008, "balance_loss_clip": 1.05472732, "balance_loss_mlp": 1.03791952, "epoch": 0.11946490305125507, "flos": 27190805184000.0, "grad_norm": 2.4724436343771083, "language_loss": 0.72123617, "learning_rate": 3.916625243753819e-06, "loss": 0.74328756, "num_input_tokens_seen": 43014180, "step": 1987, "time_per_iteration": 2.814481258392334 }, { "auxiliary_loss_clip": 0.01156854, "auxiliary_loss_mlp": 0.01054644, "balance_loss_clip": 1.05747938, "balance_loss_mlp": 1.03138638, "epoch": 0.11952502630392305, "flos": 21140791672320.0, "grad_norm": 1.9246234449532542, "language_loss": 0.72007513, "learning_rate": 3.916513929741799e-06, "loss": 0.74219012, "num_input_tokens_seen": 43032120, "step": 1988, "time_per_iteration": 2.7242019176483154 }, { "auxiliary_loss_clip": 0.0116348, "auxiliary_loss_mlp": 0.01062102, "balance_loss_clip": 1.05559146, "balance_loss_mlp": 1.03913057, "epoch": 0.11958514955659101, "flos": 22124241118080.0, "grad_norm": 1.7561483239324645, "language_loss": 0.81144297, "learning_rate": 3.91640254305538e-06, "loss": 0.83369875, "num_input_tokens_seen": 43052215, "step": 1989, "time_per_iteration": 2.6259546279907227 }, { "auxiliary_loss_clip": 0.01135956, "auxiliary_loss_mlp": 0.01057689, "balance_loss_clip": 1.05254042, "balance_loss_mlp": 1.03325129, "epoch": 0.11964527280925898, "flos": 17421452040960.0, "grad_norm": 2.5516320258539795, "language_loss": 0.75881672, "learning_rate": 3.916291083698784e-06, "loss": 0.7807532, "num_input_tokens_seen": 43069720, "step": 1990, "time_per_iteration": 2.6779251098632812 }, { "auxiliary_loss_clip": 0.0105322, "auxiliary_loss_mlp": 0.01019112, "balance_loss_clip": 1.02816892, "balance_loss_mlp": 1.01647794, "epoch": 0.11970539606192696, "flos": 70679741402880.0, "grad_norm": 0.8628582727639288, "language_loss": 0.55184531, "learning_rate": 3.916179551676238e-06, "loss": 0.57256866, "num_input_tokens_seen": 43123130, "step": 1991, "time_per_iteration": 3.3713693618774414 }, { "auxiliary_loss_clip": 0.01136423, "auxiliary_loss_mlp": 0.01053959, "balance_loss_clip": 1.05748868, "balance_loss_mlp": 1.03326464, "epoch": 0.11976551931459492, "flos": 21215019127680.0, "grad_norm": 2.286300891386994, "language_loss": 0.78371406, "learning_rate": 3.916067946991971e-06, "loss": 0.80561793, "num_input_tokens_seen": 43140015, "step": 1992, "time_per_iteration": 2.6797914505004883 }, { "auxiliary_loss_clip": 0.0117949, "auxiliary_loss_mlp": 0.01056635, "balance_loss_clip": 1.05811, "balance_loss_mlp": 1.03453374, "epoch": 0.11982564256726289, "flos": 25989306226560.0, "grad_norm": 1.8481811043026504, "language_loss": 0.78911144, "learning_rate": 3.915956269650216e-06, "loss": 0.81147265, "num_input_tokens_seen": 43160105, "step": 1993, "time_per_iteration": 2.691301107406616 }, { "auxiliary_loss_clip": 0.01126423, "auxiliary_loss_mlp": 0.0106217, "balance_loss_clip": 1.05012226, "balance_loss_mlp": 1.04081941, "epoch": 0.11988576581993086, "flos": 21650866755840.0, "grad_norm": 1.644866568705103, "language_loss": 0.82088816, "learning_rate": 3.915844519655208e-06, "loss": 0.84277415, "num_input_tokens_seen": 43179835, "step": 1994, "time_per_iteration": 2.772905111312866 }, { "auxiliary_loss_clip": 0.0115068, "auxiliary_loss_mlp": 0.01063961, "balance_loss_clip": 1.05523098, "balance_loss_mlp": 1.0433259, "epoch": 0.11994588907259883, "flos": 17857407409920.0, "grad_norm": 2.0065598513575247, "language_loss": 0.88392794, "learning_rate": 3.915732697011183e-06, "loss": 0.9060744, "num_input_tokens_seen": 43197210, "step": 1995, "time_per_iteration": 4.206532716751099 }, { "auxiliary_loss_clip": 0.01153482, "auxiliary_loss_mlp": 0.01066415, "balance_loss_clip": 1.06005812, "balance_loss_mlp": 1.0441823, "epoch": 0.1200060123252668, "flos": 24462744163200.0, "grad_norm": 1.8775058007239456, "language_loss": 0.73949909, "learning_rate": 3.9156208017223825e-06, "loss": 0.76169801, "num_input_tokens_seen": 43215050, "step": 1996, "time_per_iteration": 2.7263944149017334 }, { "auxiliary_loss_clip": 0.01141484, "auxiliary_loss_mlp": 0.01060112, "balance_loss_clip": 1.05754757, "balance_loss_mlp": 1.03808212, "epoch": 0.12006613557793476, "flos": 18732191235840.0, "grad_norm": 1.976051865072764, "language_loss": 0.88125587, "learning_rate": 3.915508833793048e-06, "loss": 0.90327179, "num_input_tokens_seen": 43233900, "step": 1997, "time_per_iteration": 4.29426383972168 }, { "auxiliary_loss_clip": 0.01165634, "auxiliary_loss_mlp": 0.00779568, "balance_loss_clip": 1.05701697, "balance_loss_mlp": 1.00001049, "epoch": 0.12012625883060274, "flos": 22267739952000.0, "grad_norm": 2.1091392562336018, "language_loss": 0.79031086, "learning_rate": 3.915396793227428e-06, "loss": 0.80976284, "num_input_tokens_seen": 43252105, "step": 1998, "time_per_iteration": 4.330955266952515 }, { "auxiliary_loss_clip": 0.0116661, "auxiliary_loss_mlp": 0.00779642, "balance_loss_clip": 1.0576719, "balance_loss_mlp": 1.00002396, "epoch": 0.1201863820832707, "flos": 21758885930880.0, "grad_norm": 1.799585336659533, "language_loss": 0.73583078, "learning_rate": 3.915284680029769e-06, "loss": 0.75529337, "num_input_tokens_seen": 43270315, "step": 1999, "time_per_iteration": 2.754770040512085 }, { "auxiliary_loss_clip": 0.01178966, "auxiliary_loss_mlp": 0.01073097, "balance_loss_clip": 1.0602119, "balance_loss_mlp": 1.05115068, "epoch": 0.12024650533593867, "flos": 21907987286400.0, "grad_norm": 2.916355473014409, "language_loss": 0.74854898, "learning_rate": 3.915172494204323e-06, "loss": 0.77106953, "num_input_tokens_seen": 43289935, "step": 2000, "time_per_iteration": 4.3900322914123535 }, { "auxiliary_loss_clip": 0.01149374, "auxiliary_loss_mlp": 0.01069735, "balance_loss_clip": 1.05375695, "balance_loss_mlp": 1.04763341, "epoch": 0.12030662858860665, "flos": 21689219502720.0, "grad_norm": 1.5203973891597686, "language_loss": 0.8496564, "learning_rate": 3.915060235755344e-06, "loss": 0.87184751, "num_input_tokens_seen": 43309325, "step": 2001, "time_per_iteration": 2.6912643909454346 }, { "auxiliary_loss_clip": 0.01154057, "auxiliary_loss_mlp": 0.01063637, "balance_loss_clip": 1.05600786, "balance_loss_mlp": 1.04265642, "epoch": 0.12036675184127461, "flos": 12933228856320.0, "grad_norm": 2.932264271186656, "language_loss": 0.74711967, "learning_rate": 3.91494790468709e-06, "loss": 0.76929653, "num_input_tokens_seen": 43327010, "step": 2002, "time_per_iteration": 2.6991024017333984 }, { "auxiliary_loss_clip": 0.01129169, "auxiliary_loss_mlp": 0.01066705, "balance_loss_clip": 1.05340302, "balance_loss_mlp": 1.0429939, "epoch": 0.12042687509394258, "flos": 20851028657280.0, "grad_norm": 2.117271428042382, "language_loss": 0.78029454, "learning_rate": 3.9148355010038185e-06, "loss": 0.80225325, "num_input_tokens_seen": 43345650, "step": 2003, "time_per_iteration": 2.731381416320801 }, { "auxiliary_loss_clip": 0.01163252, "auxiliary_loss_mlp": 0.01062886, "balance_loss_clip": 1.05728662, "balance_loss_mlp": 1.04073668, "epoch": 0.12048699834661056, "flos": 23878513451520.0, "grad_norm": 1.585850552088038, "language_loss": 0.72205627, "learning_rate": 3.914723024709793e-06, "loss": 0.74431765, "num_input_tokens_seen": 43365555, "step": 2004, "time_per_iteration": 2.725092649459839 }, { "auxiliary_loss_clip": 0.01160616, "auxiliary_loss_mlp": 0.01069457, "balance_loss_clip": 1.05870187, "balance_loss_mlp": 1.04645014, "epoch": 0.12054712159927852, "flos": 19756363726080.0, "grad_norm": 1.9357732467170252, "language_loss": 0.78415942, "learning_rate": 3.914610475809279e-06, "loss": 0.8064602, "num_input_tokens_seen": 43384990, "step": 2005, "time_per_iteration": 2.7232437133789062 }, { "auxiliary_loss_clip": 0.01073016, "auxiliary_loss_mlp": 0.00758901, "balance_loss_clip": 1.02995479, "balance_loss_mlp": 1.00011683, "epoch": 0.12060724485194649, "flos": 51672763123200.0, "grad_norm": 0.9264315537536937, "language_loss": 0.58087146, "learning_rate": 3.914497854306543e-06, "loss": 0.59919059, "num_input_tokens_seen": 43436335, "step": 2006, "time_per_iteration": 2.9570157527923584 }, { "auxiliary_loss_clip": 0.01155081, "auxiliary_loss_mlp": 0.01053472, "balance_loss_clip": 1.05803597, "balance_loss_mlp": 1.03299201, "epoch": 0.12066736810461445, "flos": 18990425088000.0, "grad_norm": 1.6109316320484448, "language_loss": 0.76524282, "learning_rate": 3.9143851602058575e-06, "loss": 0.78732836, "num_input_tokens_seen": 43456495, "step": 2007, "time_per_iteration": 2.763380289077759 }, { "auxiliary_loss_clip": 0.01147254, "auxiliary_loss_mlp": 0.01064209, "balance_loss_clip": 1.05931091, "balance_loss_mlp": 1.04177368, "epoch": 0.12072749135728243, "flos": 16471973882880.0, "grad_norm": 2.449779851562752, "language_loss": 0.83023942, "learning_rate": 3.914272393511494e-06, "loss": 0.85235405, "num_input_tokens_seen": 43473085, "step": 2008, "time_per_iteration": 2.7693119049072266 }, { "auxiliary_loss_clip": 0.01176157, "auxiliary_loss_mlp": 0.01052894, "balance_loss_clip": 1.0584172, "balance_loss_mlp": 1.03135288, "epoch": 0.1207876146099504, "flos": 18077108947200.0, "grad_norm": 2.203355340521787, "language_loss": 0.83835697, "learning_rate": 3.91415955422773e-06, "loss": 0.86064744, "num_input_tokens_seen": 43491135, "step": 2009, "time_per_iteration": 2.640944242477417 }, { "auxiliary_loss_clip": 0.01180076, "auxiliary_loss_mlp": 0.01053549, "balance_loss_clip": 1.06196725, "balance_loss_mlp": 1.02994514, "epoch": 0.12084773786261836, "flos": 21871573873920.0, "grad_norm": 1.6799099601218046, "language_loss": 0.83870012, "learning_rate": 3.914046642358844e-06, "loss": 0.8610363, "num_input_tokens_seen": 43510440, "step": 2010, "time_per_iteration": 2.716127634048462 }, { "auxiliary_loss_clip": 0.01145261, "auxiliary_loss_mlp": 0.00780804, "balance_loss_clip": 1.05555713, "balance_loss_mlp": 1.0000627, "epoch": 0.12090786111528634, "flos": 18333044328960.0, "grad_norm": 1.8933604390076018, "language_loss": 0.84194541, "learning_rate": 3.9139336579091174e-06, "loss": 0.86120605, "num_input_tokens_seen": 43530145, "step": 2011, "time_per_iteration": 2.73793625831604 }, { "auxiliary_loss_clip": 0.01148418, "auxiliary_loss_mlp": 0.01060974, "balance_loss_clip": 1.05480969, "balance_loss_mlp": 1.03905129, "epoch": 0.1209679843679543, "flos": 21105850717440.0, "grad_norm": 2.0524904800028154, "language_loss": 0.96236968, "learning_rate": 3.913820600882834e-06, "loss": 0.98446357, "num_input_tokens_seen": 43549315, "step": 2012, "time_per_iteration": 2.7269980907440186 }, { "auxiliary_loss_clip": 0.01146369, "auxiliary_loss_mlp": 0.01051396, "balance_loss_clip": 1.05808425, "balance_loss_mlp": 1.0289607, "epoch": 0.12102810762062227, "flos": 29241053585280.0, "grad_norm": 1.853151366811655, "language_loss": 0.80903435, "learning_rate": 3.913707471284283e-06, "loss": 0.83101201, "num_input_tokens_seen": 43569240, "step": 2013, "time_per_iteration": 2.740489959716797 }, { "auxiliary_loss_clip": 0.01124703, "auxiliary_loss_mlp": 0.0105341, "balance_loss_clip": 1.05300117, "balance_loss_mlp": 1.02962804, "epoch": 0.12108823087329025, "flos": 17930701111680.0, "grad_norm": 5.099975898232357, "language_loss": 0.77255923, "learning_rate": 3.9135942691177515e-06, "loss": 0.79434031, "num_input_tokens_seen": 43587710, "step": 2014, "time_per_iteration": 2.7361485958099365 }, { "auxiliary_loss_clip": 0.0116607, "auxiliary_loss_mlp": 0.01051056, "balance_loss_clip": 1.05832791, "balance_loss_mlp": 1.02791715, "epoch": 0.12114835412595822, "flos": 22091850028800.0, "grad_norm": 5.8570343294144465, "language_loss": 0.87169874, "learning_rate": 3.913480994387535e-06, "loss": 0.89387, "num_input_tokens_seen": 43606000, "step": 2015, "time_per_iteration": 2.6881515979766846 }, { "auxiliary_loss_clip": 0.01170382, "auxiliary_loss_mlp": 0.01051162, "balance_loss_clip": 1.05500197, "balance_loss_mlp": 1.0289886, "epoch": 0.12120847737862618, "flos": 20412343854720.0, "grad_norm": 2.087765239068409, "language_loss": 0.69146478, "learning_rate": 3.913367647097926e-06, "loss": 0.71368027, "num_input_tokens_seen": 43624815, "step": 2016, "time_per_iteration": 2.7096211910247803 }, { "auxiliary_loss_clip": 0.01152563, "auxiliary_loss_mlp": 0.0104714, "balance_loss_clip": 1.05737591, "balance_loss_mlp": 1.02390599, "epoch": 0.12126860063129415, "flos": 22309037614080.0, "grad_norm": 2.8043603396252865, "language_loss": 0.79858959, "learning_rate": 3.913254227253225e-06, "loss": 0.82058656, "num_input_tokens_seen": 43643960, "step": 2017, "time_per_iteration": 2.7042336463928223 }, { "auxiliary_loss_clip": 0.01156022, "auxiliary_loss_mlp": 0.0105052, "balance_loss_clip": 1.05479789, "balance_loss_mlp": 1.02740538, "epoch": 0.12132872388396213, "flos": 13699275235200.0, "grad_norm": 2.8700241463026654, "language_loss": 0.68828821, "learning_rate": 3.913140734857731e-06, "loss": 0.71035373, "num_input_tokens_seen": 43662650, "step": 2018, "time_per_iteration": 2.7015058994293213 }, { "auxiliary_loss_clip": 0.01136376, "auxiliary_loss_mlp": 0.01050749, "balance_loss_clip": 1.05524123, "balance_loss_mlp": 1.02873111, "epoch": 0.12138884713663009, "flos": 26466954307200.0, "grad_norm": 1.6132330771570709, "language_loss": 0.72476816, "learning_rate": 3.91302716991575e-06, "loss": 0.74663943, "num_input_tokens_seen": 43684205, "step": 2019, "time_per_iteration": 2.8956947326660156 }, { "auxiliary_loss_clip": 0.01107167, "auxiliary_loss_mlp": 0.01057916, "balance_loss_clip": 1.05286384, "balance_loss_mlp": 1.03482556, "epoch": 0.12144897038929806, "flos": 26141603892480.0, "grad_norm": 1.853626515444831, "language_loss": 0.92125106, "learning_rate": 3.912913532431586e-06, "loss": 0.94290185, "num_input_tokens_seen": 43706320, "step": 2020, "time_per_iteration": 2.9980764389038086 }, { "auxiliary_loss_clip": 0.0114145, "auxiliary_loss_mlp": 0.01055455, "balance_loss_clip": 1.05289125, "balance_loss_mlp": 1.03360391, "epoch": 0.12150909364196603, "flos": 24717530309760.0, "grad_norm": 1.9227427415613194, "language_loss": 0.7772885, "learning_rate": 3.912799822409549e-06, "loss": 0.79925752, "num_input_tokens_seen": 43724805, "step": 2021, "time_per_iteration": 3.01798939704895 }, { "auxiliary_loss_clip": 0.0117749, "auxiliary_loss_mlp": 0.01049007, "balance_loss_clip": 1.0610733, "balance_loss_mlp": 1.0277164, "epoch": 0.121569216894634, "flos": 25186990089600.0, "grad_norm": 2.054228820960504, "language_loss": 0.80712306, "learning_rate": 3.912686039853952e-06, "loss": 0.82938808, "num_input_tokens_seen": 43742320, "step": 2022, "time_per_iteration": 2.684309244155884 }, { "auxiliary_loss_clip": 0.01144749, "auxiliary_loss_mlp": 0.0106163, "balance_loss_clip": 1.055619, "balance_loss_mlp": 1.03697765, "epoch": 0.12162934014730196, "flos": 13444094039040.0, "grad_norm": 1.734031517866852, "language_loss": 0.84842217, "learning_rate": 3.912572184769108e-06, "loss": 0.87048596, "num_input_tokens_seen": 43760665, "step": 2023, "time_per_iteration": 2.6886441707611084 }, { "auxiliary_loss_clip": 0.01139348, "auxiliary_loss_mlp": 0.01053043, "balance_loss_clip": 1.05162323, "balance_loss_mlp": 1.03081048, "epoch": 0.12168946339996994, "flos": 16946138344320.0, "grad_norm": 2.3397199529221546, "language_loss": 0.85514021, "learning_rate": 3.912458257159335e-06, "loss": 0.87706411, "num_input_tokens_seen": 43779020, "step": 2024, "time_per_iteration": 2.8043718338012695 }, { "auxiliary_loss_clip": 0.01169767, "auxiliary_loss_mlp": 0.01055534, "balance_loss_clip": 1.05277538, "balance_loss_mlp": 1.03389716, "epoch": 0.12174958665263791, "flos": 29821585196160.0, "grad_norm": 1.8432491304976684, "language_loss": 0.72088945, "learning_rate": 3.912344257028954e-06, "loss": 0.74314243, "num_input_tokens_seen": 43798850, "step": 2025, "time_per_iteration": 2.704876184463501 }, { "auxiliary_loss_clip": 0.01148564, "auxiliary_loss_mlp": 0.01047618, "balance_loss_clip": 1.05486572, "balance_loss_mlp": 1.02555275, "epoch": 0.12180970990530587, "flos": 24641902224000.0, "grad_norm": 1.4969552271445652, "language_loss": 0.76075011, "learning_rate": 3.912230184382286e-06, "loss": 0.78271192, "num_input_tokens_seen": 43820130, "step": 2026, "time_per_iteration": 2.6957921981811523 }, { "auxiliary_loss_clip": 0.01147374, "auxiliary_loss_mlp": 0.01046261, "balance_loss_clip": 1.05086374, "balance_loss_mlp": 1.02474427, "epoch": 0.12186983315797385, "flos": 20521691832960.0, "grad_norm": 2.2064263994277478, "language_loss": 0.88769746, "learning_rate": 3.912116039223659e-06, "loss": 0.90963376, "num_input_tokens_seen": 43838485, "step": 2027, "time_per_iteration": 2.6847639083862305 }, { "auxiliary_loss_clip": 0.01143778, "auxiliary_loss_mlp": 0.01056715, "balance_loss_clip": 1.05258501, "balance_loss_mlp": 1.03667617, "epoch": 0.12192995641064182, "flos": 27818344719360.0, "grad_norm": 1.5725885574076592, "language_loss": 0.75544459, "learning_rate": 3.912001821557399e-06, "loss": 0.77744961, "num_input_tokens_seen": 43859080, "step": 2028, "time_per_iteration": 2.7706027030944824 }, { "auxiliary_loss_clip": 0.01123185, "auxiliary_loss_mlp": 0.01057136, "balance_loss_clip": 1.0518471, "balance_loss_mlp": 1.03554714, "epoch": 0.12199007966330978, "flos": 22017119783040.0, "grad_norm": 2.0550419223931193, "language_loss": 0.76802504, "learning_rate": 3.911887531387839e-06, "loss": 0.78982824, "num_input_tokens_seen": 43879030, "step": 2029, "time_per_iteration": 2.732637405395508 }, { "auxiliary_loss_clip": 0.01156591, "auxiliary_loss_mlp": 0.01052355, "balance_loss_clip": 1.05253625, "balance_loss_mlp": 1.03107572, "epoch": 0.12205020291597775, "flos": 23295216493440.0, "grad_norm": 1.707195979328818, "language_loss": 0.79164296, "learning_rate": 3.911773168719313e-06, "loss": 0.81373239, "num_input_tokens_seen": 43898505, "step": 2030, "time_per_iteration": 2.7254061698913574 }, { "auxiliary_loss_clip": 0.0116997, "auxiliary_loss_mlp": 0.01051357, "balance_loss_clip": 1.05618095, "balance_loss_mlp": 1.02930319, "epoch": 0.12211032616864573, "flos": 26031609469440.0, "grad_norm": 3.038077546298312, "language_loss": 0.74411637, "learning_rate": 3.911658733556155e-06, "loss": 0.76632965, "num_input_tokens_seen": 43917945, "step": 2031, "time_per_iteration": 2.6711080074310303 }, { "auxiliary_loss_clip": 0.01174332, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.05888343, "balance_loss_mlp": 1.02545118, "epoch": 0.12217044942131369, "flos": 20410943224320.0, "grad_norm": 1.7636188348969384, "language_loss": 0.75230348, "learning_rate": 3.911544225902707e-06, "loss": 0.7745049, "num_input_tokens_seen": 43937385, "step": 2032, "time_per_iteration": 2.7134530544281006 }, { "auxiliary_loss_clip": 0.01152363, "auxiliary_loss_mlp": 0.01045735, "balance_loss_clip": 1.05129802, "balance_loss_mlp": 1.02538586, "epoch": 0.12223057267398166, "flos": 22857142222080.0, "grad_norm": 1.5809359138264147, "language_loss": 0.89502287, "learning_rate": 3.911429645763311e-06, "loss": 0.91700387, "num_input_tokens_seen": 43958130, "step": 2033, "time_per_iteration": 2.7105965614318848 }, { "auxiliary_loss_clip": 0.01155694, "auxiliary_loss_mlp": 0.01051169, "balance_loss_clip": 1.05740523, "balance_loss_mlp": 1.03005767, "epoch": 0.12229069592664964, "flos": 20047563285120.0, "grad_norm": 1.9580868921695649, "language_loss": 0.65195286, "learning_rate": 3.911314993142311e-06, "loss": 0.67402148, "num_input_tokens_seen": 43976800, "step": 2034, "time_per_iteration": 4.222668886184692 }, { "auxiliary_loss_clip": 0.01152239, "auxiliary_loss_mlp": 0.01055659, "balance_loss_clip": 1.05550218, "balance_loss_mlp": 1.0327704, "epoch": 0.1223508191793176, "flos": 22274240313600.0, "grad_norm": 1.6376942269871653, "language_loss": 0.76459455, "learning_rate": 3.911200268044055e-06, "loss": 0.78667355, "num_input_tokens_seen": 43996620, "step": 2035, "time_per_iteration": 2.7306556701660156 }, { "auxiliary_loss_clip": 0.01176703, "auxiliary_loss_mlp": 0.01050008, "balance_loss_clip": 1.0577215, "balance_loss_mlp": 1.02798975, "epoch": 0.12241094243198557, "flos": 21285978445440.0, "grad_norm": 1.8460180606974623, "language_loss": 0.71294892, "learning_rate": 3.911085470472892e-06, "loss": 0.73521602, "num_input_tokens_seen": 44016175, "step": 2036, "time_per_iteration": 2.7327258586883545 }, { "auxiliary_loss_clip": 0.01144473, "auxiliary_loss_mlp": 0.01058389, "balance_loss_clip": 1.05778408, "balance_loss_mlp": 1.03623962, "epoch": 0.12247106568465355, "flos": 17382381022080.0, "grad_norm": 1.5772021569883852, "language_loss": 0.83130831, "learning_rate": 3.910970600433178e-06, "loss": 0.85333693, "num_input_tokens_seen": 44035060, "step": 2037, "time_per_iteration": 4.248440742492676 }, { "auxiliary_loss_clip": 0.01153641, "auxiliary_loss_mlp": 0.01060257, "balance_loss_clip": 1.0556947, "balance_loss_mlp": 1.0366174, "epoch": 0.12253118893732151, "flos": 27045438842880.0, "grad_norm": 2.676780030246967, "language_loss": 0.79765236, "learning_rate": 3.910855657929267e-06, "loss": 0.81979132, "num_input_tokens_seen": 44053330, "step": 2038, "time_per_iteration": 2.7321341037750244 }, { "auxiliary_loss_clip": 0.010642, "auxiliary_loss_mlp": 0.00759248, "balance_loss_clip": 1.02961969, "balance_loss_mlp": 1.00006962, "epoch": 0.12259131218998948, "flos": 53861518368000.0, "grad_norm": 0.8248048644272604, "language_loss": 0.58659601, "learning_rate": 3.910740642965518e-06, "loss": 0.6048305, "num_input_tokens_seen": 44107575, "step": 2039, "time_per_iteration": 4.739040851593018 }, { "auxiliary_loss_clip": 0.01128, "auxiliary_loss_mlp": 0.01064411, "balance_loss_clip": 1.05292714, "balance_loss_mlp": 1.03912663, "epoch": 0.12265143544265744, "flos": 17891917401600.0, "grad_norm": 2.1548467753138136, "language_loss": 0.80099291, "learning_rate": 3.910625555546292e-06, "loss": 0.82291704, "num_input_tokens_seen": 44126075, "step": 2040, "time_per_iteration": 2.723247766494751 }, { "auxiliary_loss_clip": 0.01149343, "auxiliary_loss_mlp": 0.01058534, "balance_loss_clip": 1.05517352, "balance_loss_mlp": 1.03673029, "epoch": 0.12271155869532542, "flos": 21799932197760.0, "grad_norm": 1.8247690225218605, "language_loss": 0.82841176, "learning_rate": 3.910510395675953e-06, "loss": 0.85049051, "num_input_tokens_seen": 44145605, "step": 2041, "time_per_iteration": 2.699110984802246 }, { "auxiliary_loss_clip": 0.01136001, "auxiliary_loss_mlp": 0.01053451, "balance_loss_clip": 1.05120957, "balance_loss_mlp": 1.03061032, "epoch": 0.12277168194799339, "flos": 19828759587840.0, "grad_norm": 1.9386136063873771, "language_loss": 0.67272276, "learning_rate": 3.9103951633588694e-06, "loss": 0.69461727, "num_input_tokens_seen": 44164770, "step": 2042, "time_per_iteration": 2.7042133808135986 }, { "auxiliary_loss_clip": 0.01133115, "auxiliary_loss_mlp": 0.01056941, "balance_loss_clip": 1.05079007, "balance_loss_mlp": 1.03517294, "epoch": 0.12283180520066135, "flos": 23221024951680.0, "grad_norm": 1.912164915278887, "language_loss": 0.81765604, "learning_rate": 3.910279858599409e-06, "loss": 0.83955657, "num_input_tokens_seen": 44184025, "step": 2043, "time_per_iteration": 2.6942050457000732 }, { "auxiliary_loss_clip": 0.01146416, "auxiliary_loss_mlp": 0.01052365, "balance_loss_clip": 1.05161905, "balance_loss_mlp": 1.03040695, "epoch": 0.12289192845332933, "flos": 18588476920320.0, "grad_norm": 1.7894844734354058, "language_loss": 0.80192459, "learning_rate": 3.910164481401946e-06, "loss": 0.82391244, "num_input_tokens_seen": 44202950, "step": 2044, "time_per_iteration": 2.6227192878723145 }, { "auxiliary_loss_clip": 0.01116285, "auxiliary_loss_mlp": 0.01052013, "balance_loss_clip": 1.05284619, "balance_loss_mlp": 1.03055525, "epoch": 0.1229520517059973, "flos": 25769532862080.0, "grad_norm": 1.7152742607840916, "language_loss": 0.7794897, "learning_rate": 3.910049031770853e-06, "loss": 0.80117267, "num_input_tokens_seen": 44221115, "step": 2045, "time_per_iteration": 2.769017219543457 }, { "auxiliary_loss_clip": 0.01163545, "auxiliary_loss_mlp": 0.01060468, "balance_loss_clip": 1.05796146, "balance_loss_mlp": 1.03827095, "epoch": 0.12301217495866526, "flos": 20887154760960.0, "grad_norm": 1.852572781372854, "language_loss": 0.67284262, "learning_rate": 3.90993350971051e-06, "loss": 0.69508278, "num_input_tokens_seen": 44240575, "step": 2046, "time_per_iteration": 2.6377944946289062 }, { "auxiliary_loss_clip": 0.01173803, "auxiliary_loss_mlp": 0.01053755, "balance_loss_clip": 1.06010675, "balance_loss_mlp": 1.03202295, "epoch": 0.12307229821133324, "flos": 22378811783040.0, "grad_norm": 4.982373490718116, "language_loss": 0.72730684, "learning_rate": 3.909817915225297e-06, "loss": 0.74958241, "num_input_tokens_seen": 44257145, "step": 2047, "time_per_iteration": 2.5791239738464355 }, { "auxiliary_loss_clip": 0.01155159, "auxiliary_loss_mlp": 0.01060632, "balance_loss_clip": 1.05398846, "balance_loss_mlp": 1.03817296, "epoch": 0.1231324214640012, "flos": 23367396873600.0, "grad_norm": 1.8194194024321948, "language_loss": 0.76583183, "learning_rate": 3.909702248319597e-06, "loss": 0.78798974, "num_input_tokens_seen": 44278035, "step": 2048, "time_per_iteration": 2.6997592449188232 }, { "auxiliary_loss_clip": 0.01146796, "auxiliary_loss_mlp": 0.01047309, "balance_loss_clip": 1.05524468, "balance_loss_mlp": 1.02798486, "epoch": 0.12319254471666917, "flos": 23767154311680.0, "grad_norm": 1.8097490634569602, "language_loss": 0.85359102, "learning_rate": 3.909586508997797e-06, "loss": 0.87553203, "num_input_tokens_seen": 44296980, "step": 2049, "time_per_iteration": 2.739617109298706 }, { "auxiliary_loss_clip": 0.01120276, "auxiliary_loss_mlp": 0.01050145, "balance_loss_clip": 1.0533725, "balance_loss_mlp": 1.02887857, "epoch": 0.12325266796933713, "flos": 23550146294400.0, "grad_norm": 2.6582136339172724, "language_loss": 0.75563407, "learning_rate": 3.909470697264285e-06, "loss": 0.77733827, "num_input_tokens_seen": 44318005, "step": 2050, "time_per_iteration": 2.7814078330993652 }, { "auxiliary_loss_clip": 0.01138568, "auxiliary_loss_mlp": 0.01057939, "balance_loss_clip": 1.05428278, "balance_loss_mlp": 1.03608823, "epoch": 0.12331279122200511, "flos": 24423996366720.0, "grad_norm": 1.81408967902731, "language_loss": 0.81166679, "learning_rate": 3.909354813123452e-06, "loss": 0.83363187, "num_input_tokens_seen": 44335260, "step": 2051, "time_per_iteration": 2.7555224895477295 }, { "auxiliary_loss_clip": 0.01171646, "auxiliary_loss_mlp": 0.00779218, "balance_loss_clip": 1.05882978, "balance_loss_mlp": 0.99996465, "epoch": 0.12337291447467308, "flos": 25484294960640.0, "grad_norm": 1.8885516327307212, "language_loss": 0.80445349, "learning_rate": 3.909238856579693e-06, "loss": 0.82396215, "num_input_tokens_seen": 44355315, "step": 2052, "time_per_iteration": 2.7676405906677246 }, { "auxiliary_loss_clip": 0.01165489, "auxiliary_loss_mlp": 0.010569, "balance_loss_clip": 1.0581975, "balance_loss_mlp": 1.03537059, "epoch": 0.12343303772734104, "flos": 23550002640000.0, "grad_norm": 2.171205541070781, "language_loss": 0.73676848, "learning_rate": 3.909122827637406e-06, "loss": 0.75899243, "num_input_tokens_seen": 44373020, "step": 2053, "time_per_iteration": 2.648609161376953 }, { "auxiliary_loss_clip": 0.01168883, "auxiliary_loss_mlp": 0.00778478, "balance_loss_clip": 1.05302441, "balance_loss_mlp": 0.99995315, "epoch": 0.12349316098000902, "flos": 47557074867840.0, "grad_norm": 1.5051513438882418, "language_loss": 0.7413671, "learning_rate": 3.909006726300991e-06, "loss": 0.76084077, "num_input_tokens_seen": 44397525, "step": 2054, "time_per_iteration": 2.871469020843506 }, { "auxiliary_loss_clip": 0.01147607, "auxiliary_loss_mlp": 0.01044612, "balance_loss_clip": 1.05402803, "balance_loss_mlp": 1.02482307, "epoch": 0.12355328423267699, "flos": 25045969294080.0, "grad_norm": 4.50189877271012, "language_loss": 0.85417157, "learning_rate": 3.908890552574849e-06, "loss": 0.8760938, "num_input_tokens_seen": 44415890, "step": 2055, "time_per_iteration": 2.7136077880859375 }, { "auxiliary_loss_clip": 0.01133829, "auxiliary_loss_mlp": 0.01047458, "balance_loss_clip": 1.05999517, "balance_loss_mlp": 1.02802706, "epoch": 0.12361340748534495, "flos": 27709140395520.0, "grad_norm": 2.0629908776416688, "language_loss": 0.77506042, "learning_rate": 3.908774306463384e-06, "loss": 0.79687333, "num_input_tokens_seen": 44436625, "step": 2056, "time_per_iteration": 2.83107852935791 }, { "auxiliary_loss_clip": 0.01158234, "auxiliary_loss_mlp": 0.01055, "balance_loss_clip": 1.05444396, "balance_loss_mlp": 1.03405499, "epoch": 0.12367353073801293, "flos": 26140598311680.0, "grad_norm": 1.9893743253373262, "language_loss": 0.83361745, "learning_rate": 3.908657987971009e-06, "loss": 0.85574985, "num_input_tokens_seen": 44455265, "step": 2057, "time_per_iteration": 2.6987085342407227 }, { "auxiliary_loss_clip": 0.01141319, "auxiliary_loss_mlp": 0.01051708, "balance_loss_clip": 1.05057144, "balance_loss_mlp": 1.02991605, "epoch": 0.1237336539906809, "flos": 25156035544320.0, "grad_norm": 1.4905135493793764, "language_loss": 0.77818203, "learning_rate": 3.90854159710213e-06, "loss": 0.80011231, "num_input_tokens_seen": 44475815, "step": 2058, "time_per_iteration": 2.7149016857147217 }, { "auxiliary_loss_clip": 0.01138087, "auxiliary_loss_mlp": 0.01058134, "balance_loss_clip": 1.05117273, "balance_loss_mlp": 1.03482866, "epoch": 0.12379377724334886, "flos": 15304589867520.0, "grad_norm": 1.8387803476985631, "language_loss": 0.8342883, "learning_rate": 3.9084251338611624e-06, "loss": 0.85625052, "num_input_tokens_seen": 44494045, "step": 2059, "time_per_iteration": 2.7030091285705566 }, { "auxiliary_loss_clip": 0.01133517, "auxiliary_loss_mlp": 0.01057399, "balance_loss_clip": 1.05123472, "balance_loss_mlp": 1.03445077, "epoch": 0.12385390049601683, "flos": 21316717509120.0, "grad_norm": 2.7478129466394217, "language_loss": 0.81420219, "learning_rate": 3.908308598252523e-06, "loss": 0.83611137, "num_input_tokens_seen": 44509120, "step": 2060, "time_per_iteration": 2.738499402999878 }, { "auxiliary_loss_clip": 0.01150334, "auxiliary_loss_mlp": 0.01054424, "balance_loss_clip": 1.05367386, "balance_loss_mlp": 1.0315125, "epoch": 0.1239140237486848, "flos": 15116309752320.0, "grad_norm": 1.8699548955873522, "language_loss": 0.86224365, "learning_rate": 3.9081919902806306e-06, "loss": 0.88429129, "num_input_tokens_seen": 44525780, "step": 2061, "time_per_iteration": 2.6492960453033447 }, { "auxiliary_loss_clip": 0.0115523, "auxiliary_loss_mlp": 0.01050307, "balance_loss_clip": 1.05506253, "balance_loss_mlp": 1.03031528, "epoch": 0.12397414700135277, "flos": 21976791788160.0, "grad_norm": 2.006361909654615, "language_loss": 0.84949362, "learning_rate": 3.908075309949906e-06, "loss": 0.87154901, "num_input_tokens_seen": 44543125, "step": 2062, "time_per_iteration": 2.5925393104553223 }, { "auxiliary_loss_clip": 0.01124676, "auxiliary_loss_mlp": 0.01058304, "balance_loss_clip": 1.05198252, "balance_loss_mlp": 1.03498697, "epoch": 0.12403427025402074, "flos": 13400892956160.0, "grad_norm": 1.6181471799462952, "language_loss": 0.78765064, "learning_rate": 3.907958557264774e-06, "loss": 0.80948043, "num_input_tokens_seen": 44560275, "step": 2063, "time_per_iteration": 2.7551674842834473 }, { "auxiliary_loss_clip": 0.01124369, "auxiliary_loss_mlp": 0.01057465, "balance_loss_clip": 1.05492854, "balance_loss_mlp": 1.03450513, "epoch": 0.12409439350668872, "flos": 15304374385920.0, "grad_norm": 2.9315517002695017, "language_loss": 0.79452097, "learning_rate": 3.907841732229663e-06, "loss": 0.81633931, "num_input_tokens_seen": 44577640, "step": 2064, "time_per_iteration": 2.699711322784424 }, { "auxiliary_loss_clip": 0.01144709, "auxiliary_loss_mlp": 0.01058768, "balance_loss_clip": 1.05316699, "balance_loss_mlp": 1.03847849, "epoch": 0.12415451675935668, "flos": 25009376313600.0, "grad_norm": 2.5611248351266016, "language_loss": 0.92676973, "learning_rate": 3.907724834849002e-06, "loss": 0.9488045, "num_input_tokens_seen": 44594860, "step": 2065, "time_per_iteration": 2.7114996910095215 }, { "auxiliary_loss_clip": 0.01147841, "auxiliary_loss_mlp": 0.01052058, "balance_loss_clip": 1.05113554, "balance_loss_mlp": 1.02943158, "epoch": 0.12421464001202465, "flos": 23659673840640.0, "grad_norm": 1.7498294279318665, "language_loss": 0.80540735, "learning_rate": 3.907607865127225e-06, "loss": 0.82740629, "num_input_tokens_seen": 44614780, "step": 2066, "time_per_iteration": 2.6958389282226562 }, { "auxiliary_loss_clip": 0.01030831, "auxiliary_loss_mlp": 0.01051436, "balance_loss_clip": 1.02768898, "balance_loss_mlp": 1.04884958, "epoch": 0.12427476326469263, "flos": 65732904345600.0, "grad_norm": 0.8715885531008962, "language_loss": 0.63299954, "learning_rate": 3.907490823068766e-06, "loss": 0.6538223, "num_input_tokens_seen": 44671240, "step": 2067, "time_per_iteration": 3.200000762939453 }, { "auxiliary_loss_clip": 0.01117858, "auxiliary_loss_mlp": 0.01057985, "balance_loss_clip": 1.04878855, "balance_loss_mlp": 1.0344646, "epoch": 0.12433488651736059, "flos": 24535427333760.0, "grad_norm": 1.9218217735084064, "language_loss": 0.93783462, "learning_rate": 3.907373708678063e-06, "loss": 0.959593, "num_input_tokens_seen": 44691050, "step": 2068, "time_per_iteration": 2.7631025314331055 }, { "auxiliary_loss_clip": 0.01166393, "auxiliary_loss_mlp": 0.0105657, "balance_loss_clip": 1.05994427, "balance_loss_mlp": 1.03697169, "epoch": 0.12439500977002856, "flos": 21031659175680.0, "grad_norm": 1.8717926968048342, "language_loss": 0.80861229, "learning_rate": 3.9072565219595596e-06, "loss": 0.83084196, "num_input_tokens_seen": 44709850, "step": 2069, "time_per_iteration": 2.6630098819732666 }, { "auxiliary_loss_clip": 0.01113262, "auxiliary_loss_mlp": 0.01062592, "balance_loss_clip": 1.04863238, "balance_loss_mlp": 1.03963184, "epoch": 0.12445513302269653, "flos": 26830621555200.0, "grad_norm": 1.5649570979854035, "language_loss": 0.777978, "learning_rate": 3.907139262917696e-06, "loss": 0.79973656, "num_input_tokens_seen": 44731475, "step": 2070, "time_per_iteration": 2.7750463485717773 }, { "auxiliary_loss_clip": 0.01156875, "auxiliary_loss_mlp": 0.01052509, "balance_loss_clip": 1.05520415, "balance_loss_mlp": 1.03055048, "epoch": 0.1245152562753645, "flos": 18368919037440.0, "grad_norm": 2.2051981544638166, "language_loss": 0.80743957, "learning_rate": 3.907021931556922e-06, "loss": 0.8295334, "num_input_tokens_seen": 44749685, "step": 2071, "time_per_iteration": 2.654171943664551 }, { "auxiliary_loss_clip": 0.01154683, "auxiliary_loss_mlp": 0.01055767, "balance_loss_clip": 1.05492425, "balance_loss_mlp": 1.03405952, "epoch": 0.12457537952803246, "flos": 33107986200960.0, "grad_norm": 2.118828414072521, "language_loss": 0.78278041, "learning_rate": 3.906904527881684e-06, "loss": 0.80488491, "num_input_tokens_seen": 44772165, "step": 2072, "time_per_iteration": 2.753159284591675 }, { "auxiliary_loss_clip": 0.0114568, "auxiliary_loss_mlp": 0.01055287, "balance_loss_clip": 1.05651307, "balance_loss_mlp": 1.03381729, "epoch": 0.12463550278070043, "flos": 22270217990400.0, "grad_norm": 7.360489773093417, "language_loss": 0.752267, "learning_rate": 3.9067870518964355e-06, "loss": 0.77427667, "num_input_tokens_seen": 44790580, "step": 2073, "time_per_iteration": 2.6561899185180664 }, { "auxiliary_loss_clip": 0.01096485, "auxiliary_loss_mlp": 0.01053193, "balance_loss_clip": 1.04471385, "balance_loss_mlp": 1.03086543, "epoch": 0.12469562603336841, "flos": 14679025580160.0, "grad_norm": 1.9234955386089483, "language_loss": 0.90560025, "learning_rate": 3.906669503605631e-06, "loss": 0.92709696, "num_input_tokens_seen": 44806730, "step": 2074, "time_per_iteration": 2.7846343517303467 }, { "auxiliary_loss_clip": 0.01105332, "auxiliary_loss_mlp": 0.01056651, "balance_loss_clip": 1.04977274, "balance_loss_mlp": 1.03346491, "epoch": 0.12475574928603637, "flos": 24644775312000.0, "grad_norm": 2.8321626325497493, "language_loss": 0.83836985, "learning_rate": 3.906551883013728e-06, "loss": 0.8599897, "num_input_tokens_seen": 44825550, "step": 2075, "time_per_iteration": 4.412928342819214 }, { "auxiliary_loss_clip": 0.01107078, "auxiliary_loss_mlp": 0.01062819, "balance_loss_clip": 1.04380202, "balance_loss_mlp": 1.03972864, "epoch": 0.12481587253870434, "flos": 21762980081280.0, "grad_norm": 2.042892519020311, "language_loss": 0.73648787, "learning_rate": 3.9064341901251865e-06, "loss": 0.75818682, "num_input_tokens_seen": 44844155, "step": 2076, "time_per_iteration": 5.925223112106323 }, { "auxiliary_loss_clip": 0.01101731, "auxiliary_loss_mlp": 0.01048176, "balance_loss_clip": 1.04774427, "balance_loss_mlp": 1.02751708, "epoch": 0.12487599579137232, "flos": 21432529935360.0, "grad_norm": 1.8779339700875872, "language_loss": 0.7622484, "learning_rate": 3.906316424944469e-06, "loss": 0.78374755, "num_input_tokens_seen": 44863780, "step": 2077, "time_per_iteration": 2.70566987991333 }, { "auxiliary_loss_clip": 0.01156274, "auxiliary_loss_mlp": 0.01062042, "balance_loss_clip": 1.05365288, "balance_loss_mlp": 1.04001164, "epoch": 0.12493611904404028, "flos": 16107624276480.0, "grad_norm": 2.022280968605665, "language_loss": 0.82290226, "learning_rate": 3.906198587476043e-06, "loss": 0.84508544, "num_input_tokens_seen": 44881480, "step": 2078, "time_per_iteration": 4.302385568618774 }, { "auxiliary_loss_clip": 0.01144821, "auxiliary_loss_mlp": 0.01050482, "balance_loss_clip": 1.05281842, "balance_loss_mlp": 1.02855957, "epoch": 0.12499624229670825, "flos": 21580266574080.0, "grad_norm": 1.6413520418295044, "language_loss": 0.75195324, "learning_rate": 3.906080677724374e-06, "loss": 0.77390629, "num_input_tokens_seen": 44900390, "step": 2079, "time_per_iteration": 2.6915946006774902 }, { "auxiliary_loss_clip": 0.01166758, "auxiliary_loss_mlp": 0.01058474, "balance_loss_clip": 1.05881989, "balance_loss_mlp": 1.03696847, "epoch": 0.1250563655493762, "flos": 25699040421120.0, "grad_norm": 6.733284446627088, "language_loss": 0.83874094, "learning_rate": 3.905962695693935e-06, "loss": 0.86099327, "num_input_tokens_seen": 44920375, "step": 2080, "time_per_iteration": 2.7467572689056396 }, { "auxiliary_loss_clip": 0.01156163, "auxiliary_loss_mlp": 0.01059409, "balance_loss_clip": 1.05525088, "balance_loss_mlp": 1.03885686, "epoch": 0.12511648880204418, "flos": 16909509450240.0, "grad_norm": 1.8581885454518776, "language_loss": 0.84644079, "learning_rate": 3.9058446413892e-06, "loss": 0.86859655, "num_input_tokens_seen": 44938415, "step": 2081, "time_per_iteration": 2.685875654220581 }, { "auxiliary_loss_clip": 0.01156835, "auxiliary_loss_mlp": 0.01046398, "balance_loss_clip": 1.05375946, "balance_loss_mlp": 1.02594149, "epoch": 0.12517661205471217, "flos": 17567500740480.0, "grad_norm": 1.8191819349610059, "language_loss": 0.76739037, "learning_rate": 3.905726514814646e-06, "loss": 0.78942269, "num_input_tokens_seen": 44957135, "step": 2082, "time_per_iteration": 2.6133053302764893 }, { "auxiliary_loss_clip": 0.01152911, "auxiliary_loss_mlp": 0.0104632, "balance_loss_clip": 1.05701911, "balance_loss_mlp": 1.02463615, "epoch": 0.12523673530738014, "flos": 16033791870720.0, "grad_norm": 2.5415589476696265, "language_loss": 0.79044539, "learning_rate": 3.9056083159747495e-06, "loss": 0.81243765, "num_input_tokens_seen": 44974480, "step": 2083, "time_per_iteration": 2.6963307857513428 }, { "auxiliary_loss_clip": 0.01147874, "auxiliary_loss_mlp": 0.01047351, "balance_loss_clip": 1.05509973, "balance_loss_mlp": 1.02421284, "epoch": 0.1252968585600481, "flos": 18807747494400.0, "grad_norm": 2.1696249857299, "language_loss": 0.89831448, "learning_rate": 3.9054900448739966e-06, "loss": 0.92026675, "num_input_tokens_seen": 44990310, "step": 2084, "time_per_iteration": 2.6770403385162354 }, { "auxiliary_loss_clip": 0.01131068, "auxiliary_loss_mlp": 0.01048299, "balance_loss_clip": 1.05299771, "balance_loss_mlp": 1.02729464, "epoch": 0.12535698181271607, "flos": 27271568914560.0, "grad_norm": 1.8896331095253402, "language_loss": 0.80354226, "learning_rate": 3.905371701516869e-06, "loss": 0.82533598, "num_input_tokens_seen": 45010720, "step": 2085, "time_per_iteration": 2.749783515930176 }, { "auxiliary_loss_clip": 0.01170318, "auxiliary_loss_mlp": 0.01051018, "balance_loss_clip": 1.05725896, "balance_loss_mlp": 1.03001356, "epoch": 0.12541710506538403, "flos": 22054107813120.0, "grad_norm": 1.8300316094254767, "language_loss": 0.88228154, "learning_rate": 3.905253285907856e-06, "loss": 0.90449488, "num_input_tokens_seen": 45030360, "step": 2086, "time_per_iteration": 2.603515148162842 }, { "auxiliary_loss_clip": 0.01134598, "auxiliary_loss_mlp": 0.01044925, "balance_loss_clip": 1.05278981, "balance_loss_mlp": 1.02522027, "epoch": 0.125477228318052, "flos": 12603173760000.0, "grad_norm": 2.0471238132540344, "language_loss": 0.86819696, "learning_rate": 3.905134798051447e-06, "loss": 0.88999224, "num_input_tokens_seen": 45045085, "step": 2087, "time_per_iteration": 2.6265859603881836 }, { "auxiliary_loss_clip": 0.01146999, "auxiliary_loss_mlp": 0.01058875, "balance_loss_clip": 1.05599046, "balance_loss_mlp": 1.03651142, "epoch": 0.12553735157071996, "flos": 23878549365120.0, "grad_norm": 2.3362397674907758, "language_loss": 0.73027468, "learning_rate": 3.905016237952136e-06, "loss": 0.75233346, "num_input_tokens_seen": 45065145, "step": 2088, "time_per_iteration": 2.65324330329895 }, { "auxiliary_loss_clip": 0.01062529, "auxiliary_loss_mlp": 0.01013405, "balance_loss_clip": 1.02985716, "balance_loss_mlp": 1.01079392, "epoch": 0.12559747482338796, "flos": 69920841830400.0, "grad_norm": 0.7742255614948045, "language_loss": 0.61767036, "learning_rate": 3.904897605614418e-06, "loss": 0.6384297, "num_input_tokens_seen": 45126230, "step": 2089, "time_per_iteration": 3.1219804286956787 }, { "auxiliary_loss_clip": 0.01149606, "auxiliary_loss_mlp": 0.01060841, "balance_loss_clip": 1.05670094, "balance_loss_mlp": 1.0388943, "epoch": 0.12565759807605592, "flos": 24279563779200.0, "grad_norm": 1.817095421446176, "language_loss": 0.7781918, "learning_rate": 3.904778901042793e-06, "loss": 0.80029625, "num_input_tokens_seen": 45145545, "step": 2090, "time_per_iteration": 2.700425863265991 }, { "auxiliary_loss_clip": 0.01046946, "auxiliary_loss_mlp": 0.01013884, "balance_loss_clip": 1.03125095, "balance_loss_mlp": 1.01101136, "epoch": 0.12571772132872389, "flos": 56451180286080.0, "grad_norm": 0.760599485634597, "language_loss": 0.59434772, "learning_rate": 3.90466012424176e-06, "loss": 0.61495602, "num_input_tokens_seen": 45206845, "step": 2091, "time_per_iteration": 3.0814294815063477 }, { "auxiliary_loss_clip": 0.01159814, "auxiliary_loss_mlp": 0.01060546, "balance_loss_clip": 1.05760789, "balance_loss_mlp": 1.041067, "epoch": 0.12577784458139185, "flos": 41245846675200.0, "grad_norm": 1.6552462178493936, "language_loss": 0.62916517, "learning_rate": 3.904541275215825e-06, "loss": 0.6513688, "num_input_tokens_seen": 45228495, "step": 2092, "time_per_iteration": 2.7813880443573 }, { "auxiliary_loss_clip": 0.01147016, "auxiliary_loss_mlp": 0.01061963, "balance_loss_clip": 1.05395663, "balance_loss_mlp": 1.04069614, "epoch": 0.12583796783405982, "flos": 19755501799680.0, "grad_norm": 2.279616692029291, "language_loss": 0.80507946, "learning_rate": 3.904422353969493e-06, "loss": 0.82716924, "num_input_tokens_seen": 45245720, "step": 2093, "time_per_iteration": 2.6768014430999756 }, { "auxiliary_loss_clip": 0.01146976, "auxiliary_loss_mlp": 0.01075616, "balance_loss_clip": 1.0524025, "balance_loss_mlp": 1.05380058, "epoch": 0.12589809108672778, "flos": 22602104680320.0, "grad_norm": 1.7347385846840702, "language_loss": 0.76003867, "learning_rate": 3.904303360507276e-06, "loss": 0.78226459, "num_input_tokens_seen": 45265650, "step": 2094, "time_per_iteration": 2.6730611324310303 }, { "auxiliary_loss_clip": 0.01117887, "auxiliary_loss_mlp": 0.01069309, "balance_loss_clip": 1.0500071, "balance_loss_mlp": 1.04892457, "epoch": 0.12595821433939577, "flos": 45222845541120.0, "grad_norm": 1.5703706409155747, "language_loss": 0.76664734, "learning_rate": 3.9041842948336835e-06, "loss": 0.78851926, "num_input_tokens_seen": 45287790, "step": 2095, "time_per_iteration": 2.958367109298706 }, { "auxiliary_loss_clip": 0.01147751, "auxiliary_loss_mlp": 0.01058477, "balance_loss_clip": 1.05202031, "balance_loss_mlp": 1.03782988, "epoch": 0.12601833759206374, "flos": 14319811618560.0, "grad_norm": 2.2556524892449326, "language_loss": 0.83266854, "learning_rate": 3.904065156953232e-06, "loss": 0.85473078, "num_input_tokens_seen": 45305720, "step": 2096, "time_per_iteration": 2.7097342014312744 }, { "auxiliary_loss_clip": 0.01163652, "auxiliary_loss_mlp": 0.01056552, "balance_loss_clip": 1.05806553, "balance_loss_mlp": 1.03577375, "epoch": 0.1260784608447317, "flos": 21288241002240.0, "grad_norm": 1.7589400475615893, "language_loss": 0.75478256, "learning_rate": 3.903945946870439e-06, "loss": 0.77698463, "num_input_tokens_seen": 45325290, "step": 2097, "time_per_iteration": 2.642056703567505 }, { "auxiliary_loss_clip": 0.01156719, "auxiliary_loss_mlp": 0.01063976, "balance_loss_clip": 1.05648863, "balance_loss_mlp": 1.04527175, "epoch": 0.12613858409739967, "flos": 26251311006720.0, "grad_norm": 1.8828235460619742, "language_loss": 0.87110066, "learning_rate": 3.9038266645898246e-06, "loss": 0.89330757, "num_input_tokens_seen": 45344465, "step": 2098, "time_per_iteration": 2.63826584815979 }, { "auxiliary_loss_clip": 0.01117414, "auxiliary_loss_mlp": 0.01058025, "balance_loss_clip": 1.04983974, "balance_loss_mlp": 1.03475559, "epoch": 0.12619870735006763, "flos": 21579979265280.0, "grad_norm": 1.8855647331078333, "language_loss": 0.69494271, "learning_rate": 3.903707310115912e-06, "loss": 0.7166971, "num_input_tokens_seen": 45362465, "step": 2099, "time_per_iteration": 2.7813057899475098 }, { "auxiliary_loss_clip": 0.01142696, "auxiliary_loss_mlp": 0.01061431, "balance_loss_clip": 1.04979372, "balance_loss_mlp": 1.03923464, "epoch": 0.1262588306027356, "flos": 23367037737600.0, "grad_norm": 2.0457253500590498, "language_loss": 0.81949925, "learning_rate": 3.903587883453228e-06, "loss": 0.84154058, "num_input_tokens_seen": 45382700, "step": 2100, "time_per_iteration": 2.704871416091919 }, { "auxiliary_loss_clip": 0.01159613, "auxiliary_loss_mlp": 0.01055067, "balance_loss_clip": 1.0620985, "balance_loss_mlp": 1.03408623, "epoch": 0.12631895385540357, "flos": 23949185460480.0, "grad_norm": 1.7810176086536167, "language_loss": 0.80399859, "learning_rate": 3.903468384606302e-06, "loss": 0.82614541, "num_input_tokens_seen": 45401005, "step": 2101, "time_per_iteration": 2.7071452140808105 }, { "auxiliary_loss_clip": 0.0106985, "auxiliary_loss_mlp": 0.01010859, "balance_loss_clip": 1.02823138, "balance_loss_mlp": 1.00803375, "epoch": 0.12637907710807156, "flos": 70282138780800.0, "grad_norm": 0.7128618749962091, "language_loss": 0.57087427, "learning_rate": 3.903348813579662e-06, "loss": 0.59168136, "num_input_tokens_seen": 45466555, "step": 2102, "time_per_iteration": 3.20320987701416 }, { "auxiliary_loss_clip": 0.01140495, "auxiliary_loss_mlp": 0.01056574, "balance_loss_clip": 1.053671, "balance_loss_mlp": 1.03661788, "epoch": 0.12643920036073952, "flos": 18915084311040.0, "grad_norm": 2.0306165352193988, "language_loss": 0.93653679, "learning_rate": 3.903229170377845e-06, "loss": 0.95850742, "num_input_tokens_seen": 45485165, "step": 2103, "time_per_iteration": 2.6628894805908203 }, { "auxiliary_loss_clip": 0.01144405, "auxiliary_loss_mlp": 0.01040745, "balance_loss_clip": 1.04991472, "balance_loss_mlp": 1.02174282, "epoch": 0.1264993236134075, "flos": 27782470010880.0, "grad_norm": 1.5962316578756222, "language_loss": 0.7804662, "learning_rate": 3.903109455005387e-06, "loss": 0.80231774, "num_input_tokens_seen": 45504630, "step": 2104, "time_per_iteration": 2.6215474605560303 }, { "auxiliary_loss_clip": 0.01135927, "auxiliary_loss_mlp": 0.01056343, "balance_loss_clip": 1.05414486, "balance_loss_mlp": 1.03683996, "epoch": 0.12655944686607545, "flos": 24754697907840.0, "grad_norm": 1.7362499149688688, "language_loss": 0.80728614, "learning_rate": 3.902989667466828e-06, "loss": 0.82920885, "num_input_tokens_seen": 45524885, "step": 2105, "time_per_iteration": 2.74128794670105 }, { "auxiliary_loss_clip": 0.01162904, "auxiliary_loss_mlp": 0.01056367, "balance_loss_clip": 1.05482686, "balance_loss_mlp": 1.03514743, "epoch": 0.12661957011874342, "flos": 24133048202880.0, "grad_norm": 1.9810187943106816, "language_loss": 0.83402872, "learning_rate": 3.90286980776671e-06, "loss": 0.85622144, "num_input_tokens_seen": 45545000, "step": 2106, "time_per_iteration": 2.676694631576538 }, { "auxiliary_loss_clip": 0.01126632, "auxiliary_loss_mlp": 0.01052067, "balance_loss_clip": 1.05697966, "balance_loss_mlp": 1.03147984, "epoch": 0.12667969337141138, "flos": 24569614103040.0, "grad_norm": 1.6951691508845637, "language_loss": 0.73469931, "learning_rate": 3.902749875909578e-06, "loss": 0.7564863, "num_input_tokens_seen": 45564210, "step": 2107, "time_per_iteration": 2.7506372928619385 }, { "auxiliary_loss_clip": 0.01162931, "auxiliary_loss_mlp": 0.01044317, "balance_loss_clip": 1.05320692, "balance_loss_mlp": 1.02599406, "epoch": 0.12673981662407935, "flos": 22961677777920.0, "grad_norm": 2.0116792159666477, "language_loss": 0.79395336, "learning_rate": 3.90262987189998e-06, "loss": 0.81602579, "num_input_tokens_seen": 45583030, "step": 2108, "time_per_iteration": 2.6611146926879883 }, { "auxiliary_loss_clip": 0.01168073, "auxiliary_loss_mlp": 0.01049192, "balance_loss_clip": 1.05300844, "balance_loss_mlp": 1.02945089, "epoch": 0.12679993987674734, "flos": 17274864637440.0, "grad_norm": 1.9298328790617403, "language_loss": 0.7561394, "learning_rate": 3.902509795742467e-06, "loss": 0.77831209, "num_input_tokens_seen": 45602265, "step": 2109, "time_per_iteration": 2.5963573455810547 }, { "auxiliary_loss_clip": 0.01111025, "auxiliary_loss_mlp": 0.01053822, "balance_loss_clip": 1.04636049, "balance_loss_mlp": 1.0335331, "epoch": 0.1268600631294153, "flos": 17275080119040.0, "grad_norm": 1.6171901700648081, "language_loss": 0.82806516, "learning_rate": 3.902389647441592e-06, "loss": 0.84971368, "num_input_tokens_seen": 45620595, "step": 2110, "time_per_iteration": 2.6745550632476807 }, { "auxiliary_loss_clip": 0.01145969, "auxiliary_loss_mlp": 0.00778071, "balance_loss_clip": 1.05419564, "balance_loss_mlp": 0.99996144, "epoch": 0.12692018638208327, "flos": 24061047390720.0, "grad_norm": 1.6765217216011241, "language_loss": 0.78092968, "learning_rate": 3.90226942700191e-06, "loss": 0.80017006, "num_input_tokens_seen": 45641140, "step": 2111, "time_per_iteration": 2.65983510017395 }, { "auxiliary_loss_clip": 0.01130932, "auxiliary_loss_mlp": 0.01076547, "balance_loss_clip": 1.05490458, "balance_loss_mlp": 1.05352807, "epoch": 0.12698030963475124, "flos": 31831900652160.0, "grad_norm": 2.15738266202174, "language_loss": 0.77103376, "learning_rate": 3.902149134427982e-06, "loss": 0.79310858, "num_input_tokens_seen": 45662315, "step": 2112, "time_per_iteration": 2.870299816131592 }, { "auxiliary_loss_clip": 0.01129438, "auxiliary_loss_mlp": 0.01074863, "balance_loss_clip": 1.05213726, "balance_loss_mlp": 1.05427516, "epoch": 0.1270404328874192, "flos": 25187744275200.0, "grad_norm": 1.9191529425470424, "language_loss": 0.85806453, "learning_rate": 3.902028769724367e-06, "loss": 0.88010758, "num_input_tokens_seen": 45680335, "step": 2113, "time_per_iteration": 4.26338267326355 }, { "auxiliary_loss_clip": 0.01137468, "auxiliary_loss_mlp": 0.01078067, "balance_loss_clip": 1.05511892, "balance_loss_mlp": 1.05670488, "epoch": 0.12710055614008717, "flos": 15997342544640.0, "grad_norm": 1.9721234476704599, "language_loss": 0.74027002, "learning_rate": 3.9019083328956315e-06, "loss": 0.7624253, "num_input_tokens_seen": 45696240, "step": 2114, "time_per_iteration": 2.7573230266571045 }, { "auxiliary_loss_clip": 0.01156713, "auxiliary_loss_mlp": 0.01060574, "balance_loss_clip": 1.05770111, "balance_loss_mlp": 1.03924704, "epoch": 0.12716067939275516, "flos": 15085642515840.0, "grad_norm": 1.7921743813213327, "language_loss": 0.83240676, "learning_rate": 3.901787823946341e-06, "loss": 0.85457963, "num_input_tokens_seen": 45713695, "step": 2115, "time_per_iteration": 4.1369829177856445 }, { "auxiliary_loss_clip": 0.01154653, "auxiliary_loss_mlp": 0.01065557, "balance_loss_clip": 1.05875492, "balance_loss_mlp": 1.04476702, "epoch": 0.12722080264542313, "flos": 28366736636160.0, "grad_norm": 1.4840591347809418, "language_loss": 0.87010503, "learning_rate": 3.901667242881065e-06, "loss": 0.89230716, "num_input_tokens_seen": 45736655, "step": 2116, "time_per_iteration": 2.73896861076355 }, { "auxiliary_loss_clip": 0.01139498, "auxiliary_loss_mlp": 0.00777066, "balance_loss_clip": 1.05413389, "balance_loss_mlp": 0.99995339, "epoch": 0.1272809258980911, "flos": 32379897519360.0, "grad_norm": 1.753205985010591, "language_loss": 0.70374918, "learning_rate": 3.9015465897043775e-06, "loss": 0.72291481, "num_input_tokens_seen": 45758195, "step": 2117, "time_per_iteration": 2.783156156539917 }, { "auxiliary_loss_clip": 0.01127455, "auxiliary_loss_mlp": 0.0106424, "balance_loss_clip": 1.04978406, "balance_loss_mlp": 1.04068434, "epoch": 0.12734104915075906, "flos": 16034402401920.0, "grad_norm": 1.9957647698478755, "language_loss": 0.86237884, "learning_rate": 3.901425864420852e-06, "loss": 0.8842957, "num_input_tokens_seen": 45774280, "step": 2118, "time_per_iteration": 4.322036266326904 }, { "auxiliary_loss_clip": 0.01161417, "auxiliary_loss_mlp": 0.01049008, "balance_loss_clip": 1.05827069, "balance_loss_mlp": 1.02951694, "epoch": 0.12740117240342702, "flos": 18260325244800.0, "grad_norm": 1.705293179953873, "language_loss": 0.87577266, "learning_rate": 3.901305067035068e-06, "loss": 0.89787692, "num_input_tokens_seen": 45792760, "step": 2119, "time_per_iteration": 2.6559741497039795 }, { "auxiliary_loss_clip": 0.01145426, "auxiliary_loss_mlp": 0.0077754, "balance_loss_clip": 1.05233431, "balance_loss_mlp": 0.99984539, "epoch": 0.127461295656095, "flos": 12121790664960.0, "grad_norm": 2.05013605026053, "language_loss": 0.87824571, "learning_rate": 3.901184197551605e-06, "loss": 0.89747536, "num_input_tokens_seen": 45804300, "step": 2120, "time_per_iteration": 2.6154048442840576 }, { "auxiliary_loss_clip": 0.01170497, "auxiliary_loss_mlp": 0.01046075, "balance_loss_clip": 1.05822706, "balance_loss_mlp": 1.02626204, "epoch": 0.12752141890876295, "flos": 23149095966720.0, "grad_norm": 1.9784951602308867, "language_loss": 0.75584805, "learning_rate": 3.901063255975046e-06, "loss": 0.77801377, "num_input_tokens_seen": 45823780, "step": 2121, "time_per_iteration": 2.579265832901001 }, { "auxiliary_loss_clip": 0.0111249, "auxiliary_loss_mlp": 0.01047949, "balance_loss_clip": 1.04741263, "balance_loss_mlp": 1.02727842, "epoch": 0.12758154216143094, "flos": 21615997628160.0, "grad_norm": 2.0293629108662405, "language_loss": 0.82732606, "learning_rate": 3.900942242309978e-06, "loss": 0.84893048, "num_input_tokens_seen": 45840495, "step": 2122, "time_per_iteration": 2.793870210647583 }, { "auxiliary_loss_clip": 0.01151713, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.05901408, "balance_loss_mlp": 1.02983987, "epoch": 0.1276416654140989, "flos": 15924874855680.0, "grad_norm": 1.7660235451894624, "language_loss": 0.78699338, "learning_rate": 3.90082115656099e-06, "loss": 0.80900776, "num_input_tokens_seen": 45857735, "step": 2123, "time_per_iteration": 2.70546293258667 }, { "auxiliary_loss_clip": 0.01172823, "auxiliary_loss_mlp": 0.01055328, "balance_loss_clip": 1.05931985, "balance_loss_mlp": 1.03478789, "epoch": 0.12770178866676687, "flos": 22382690451840.0, "grad_norm": 1.5643885422181942, "language_loss": 0.78931451, "learning_rate": 3.900699998732673e-06, "loss": 0.81159604, "num_input_tokens_seen": 45876485, "step": 2124, "time_per_iteration": 2.661712408065796 }, { "auxiliary_loss_clip": 0.01160474, "auxiliary_loss_mlp": 0.00776885, "balance_loss_clip": 1.05457389, "balance_loss_mlp": 0.99987447, "epoch": 0.12776191191943484, "flos": 21652482867840.0, "grad_norm": 1.9695028631977674, "language_loss": 0.75605726, "learning_rate": 3.900578768829623e-06, "loss": 0.7754308, "num_input_tokens_seen": 45894645, "step": 2125, "time_per_iteration": 2.696021556854248 }, { "auxiliary_loss_clip": 0.01158163, "auxiliary_loss_mlp": 0.00777059, "balance_loss_clip": 1.05398965, "balance_loss_mlp": 1.00002348, "epoch": 0.1278220351721028, "flos": 25735561574400.0, "grad_norm": 3.019802885219414, "language_loss": 0.78016824, "learning_rate": 3.900457466856434e-06, "loss": 0.79952049, "num_input_tokens_seen": 45913755, "step": 2126, "time_per_iteration": 2.721435308456421 }, { "auxiliary_loss_clip": 0.01124637, "auxiliary_loss_mlp": 0.010537, "balance_loss_clip": 1.05406642, "balance_loss_mlp": 1.03504348, "epoch": 0.12788215842477077, "flos": 41243224982400.0, "grad_norm": 1.3825945270792501, "language_loss": 0.6927852, "learning_rate": 3.9003360928177085e-06, "loss": 0.71456861, "num_input_tokens_seen": 45936095, "step": 2127, "time_per_iteration": 2.902101993560791 }, { "auxiliary_loss_clip": 0.01030231, "auxiliary_loss_mlp": 0.00759051, "balance_loss_clip": 1.02830005, "balance_loss_mlp": 1.00050259, "epoch": 0.12794228167743876, "flos": 70877430881280.0, "grad_norm": 0.853491438999862, "language_loss": 0.62831402, "learning_rate": 3.900214646718047e-06, "loss": 0.64620686, "num_input_tokens_seen": 46004655, "step": 2128, "time_per_iteration": 3.3387396335601807 }, { "auxiliary_loss_clip": 0.01145823, "auxiliary_loss_mlp": 0.01047815, "balance_loss_clip": 1.05080712, "balance_loss_mlp": 1.02599955, "epoch": 0.12800240493010673, "flos": 16289727252480.0, "grad_norm": 2.066959353069841, "language_loss": 0.77626479, "learning_rate": 3.900093128562056e-06, "loss": 0.7982012, "num_input_tokens_seen": 46023610, "step": 2129, "time_per_iteration": 2.611309766769409 }, { "auxiliary_loss_clip": 0.01122914, "auxiliary_loss_mlp": 0.01052577, "balance_loss_clip": 1.05058527, "balance_loss_mlp": 1.03029668, "epoch": 0.1280625281827747, "flos": 20631542601600.0, "grad_norm": 2.1214737401843893, "language_loss": 0.79263359, "learning_rate": 3.899971538354343e-06, "loss": 0.81438851, "num_input_tokens_seen": 46041725, "step": 2130, "time_per_iteration": 2.753243923187256 }, { "auxiliary_loss_clip": 0.01139626, "auxiliary_loss_mlp": 0.01052453, "balance_loss_clip": 1.05133748, "balance_loss_mlp": 1.03147244, "epoch": 0.12812265143544266, "flos": 22638230784000.0, "grad_norm": 1.7780274650921335, "language_loss": 0.70945668, "learning_rate": 3.899849876099518e-06, "loss": 0.73137754, "num_input_tokens_seen": 46061095, "step": 2131, "time_per_iteration": 2.6809306144714355 }, { "auxiliary_loss_clip": 0.01102824, "auxiliary_loss_mlp": 0.01052393, "balance_loss_clip": 1.04982638, "balance_loss_mlp": 1.03163886, "epoch": 0.12818277468811062, "flos": 34714701463680.0, "grad_norm": 2.2916674504462655, "language_loss": 0.72298968, "learning_rate": 3.899728141802197e-06, "loss": 0.74454176, "num_input_tokens_seen": 46082670, "step": 2132, "time_per_iteration": 2.8769233226776123 }, { "auxiliary_loss_clip": 0.01102594, "auxiliary_loss_mlp": 0.01055993, "balance_loss_clip": 1.04384947, "balance_loss_mlp": 1.03348672, "epoch": 0.1282428979407786, "flos": 23112107936640.0, "grad_norm": 2.0316054281953155, "language_loss": 0.82128644, "learning_rate": 3.8996063354669935e-06, "loss": 0.84287226, "num_input_tokens_seen": 46102410, "step": 2133, "time_per_iteration": 2.766897678375244 }, { "auxiliary_loss_clip": 0.01163396, "auxiliary_loss_mlp": 0.01057069, "balance_loss_clip": 1.05397773, "balance_loss_mlp": 1.03458595, "epoch": 0.12830302119344655, "flos": 20886508316160.0, "grad_norm": 3.232115826630309, "language_loss": 0.80001891, "learning_rate": 3.899484457098528e-06, "loss": 0.82222354, "num_input_tokens_seen": 46121145, "step": 2134, "time_per_iteration": 2.6347672939300537 }, { "auxiliary_loss_clip": 0.01159056, "auxiliary_loss_mlp": 0.01046209, "balance_loss_clip": 1.05907345, "balance_loss_mlp": 1.02614641, "epoch": 0.12836314444611455, "flos": 21397768548480.0, "grad_norm": 1.731952504909339, "language_loss": 0.82657921, "learning_rate": 3.899362506701421e-06, "loss": 0.84863198, "num_input_tokens_seen": 46140740, "step": 2135, "time_per_iteration": 2.6393656730651855 }, { "auxiliary_loss_clip": 0.0114208, "auxiliary_loss_mlp": 0.0105553, "balance_loss_clip": 1.05345035, "balance_loss_mlp": 1.03411996, "epoch": 0.1284232676987825, "flos": 13662466773120.0, "grad_norm": 2.1083924470752278, "language_loss": 0.7764526, "learning_rate": 3.899240484280298e-06, "loss": 0.79842871, "num_input_tokens_seen": 46156805, "step": 2136, "time_per_iteration": 2.7195920944213867 }, { "auxiliary_loss_clip": 0.01020946, "auxiliary_loss_mlp": 0.01003991, "balance_loss_clip": 1.01967573, "balance_loss_mlp": 1.00096273, "epoch": 0.12848339095145048, "flos": 59994737735040.0, "grad_norm": 0.8964253308146478, "language_loss": 0.59152198, "learning_rate": 3.899118389839785e-06, "loss": 0.61177135, "num_input_tokens_seen": 46222085, "step": 2137, "time_per_iteration": 3.416015625 }, { "auxiliary_loss_clip": 0.01153694, "auxiliary_loss_mlp": 0.01054623, "balance_loss_clip": 1.05178177, "balance_loss_mlp": 1.03483438, "epoch": 0.12854351420411844, "flos": 13881378211200.0, "grad_norm": 3.244493357011547, "language_loss": 0.82344306, "learning_rate": 3.898996223384512e-06, "loss": 0.84552622, "num_input_tokens_seen": 46239970, "step": 2138, "time_per_iteration": 2.65515398979187 }, { "auxiliary_loss_clip": 0.01159586, "auxiliary_loss_mlp": 0.01049293, "balance_loss_clip": 1.05592752, "balance_loss_mlp": 1.02665496, "epoch": 0.1286036374567864, "flos": 22637943475200.0, "grad_norm": 2.5417837252920323, "language_loss": 0.78691363, "learning_rate": 3.898873984919113e-06, "loss": 0.8090024, "num_input_tokens_seen": 46257740, "step": 2139, "time_per_iteration": 2.651132345199585 }, { "auxiliary_loss_clip": 0.01136892, "auxiliary_loss_mlp": 0.01045928, "balance_loss_clip": 1.05267286, "balance_loss_mlp": 1.02582908, "epoch": 0.12866376070945437, "flos": 16324775948160.0, "grad_norm": 1.9541049485452633, "language_loss": 0.85289955, "learning_rate": 3.8987516744482215e-06, "loss": 0.87472773, "num_input_tokens_seen": 46275445, "step": 2140, "time_per_iteration": 2.730156183242798 }, { "auxiliary_loss_clip": 0.01143134, "auxiliary_loss_mlp": 0.01044337, "balance_loss_clip": 1.05203128, "balance_loss_mlp": 1.02482224, "epoch": 0.12872388396212234, "flos": 11874546374400.0, "grad_norm": 1.8185491602156885, "language_loss": 0.86268306, "learning_rate": 3.898629291976476e-06, "loss": 0.88455778, "num_input_tokens_seen": 46291710, "step": 2141, "time_per_iteration": 2.62223482131958 }, { "auxiliary_loss_clip": 0.01146971, "auxiliary_loss_mlp": 0.01045813, "balance_loss_clip": 1.0528295, "balance_loss_mlp": 1.02548814, "epoch": 0.12878400721479033, "flos": 28366700722560.0, "grad_norm": 3.1267362471736684, "language_loss": 0.68282312, "learning_rate": 3.898506837508518e-06, "loss": 0.70475101, "num_input_tokens_seen": 46311335, "step": 2142, "time_per_iteration": 2.71232271194458 }, { "auxiliary_loss_clip": 0.01165678, "auxiliary_loss_mlp": 0.0077895, "balance_loss_clip": 1.05764627, "balance_loss_mlp": 0.99990749, "epoch": 0.1288441304674583, "flos": 25885632597120.0, "grad_norm": 2.373838274123079, "language_loss": 0.83479214, "learning_rate": 3.89838431104899e-06, "loss": 0.85423845, "num_input_tokens_seen": 46330985, "step": 2143, "time_per_iteration": 2.677692174911499 }, { "auxiliary_loss_clip": 0.01175134, "auxiliary_loss_mlp": 0.00777405, "balance_loss_clip": 1.0598439, "balance_loss_mlp": 0.99994075, "epoch": 0.12890425372012626, "flos": 20813789232000.0, "grad_norm": 1.5662270309624111, "language_loss": 0.81703234, "learning_rate": 3.898261712602539e-06, "loss": 0.83655775, "num_input_tokens_seen": 46351295, "step": 2144, "time_per_iteration": 2.712620496749878 }, { "auxiliary_loss_clip": 0.01130321, "auxiliary_loss_mlp": 0.01053521, "balance_loss_clip": 1.04658103, "balance_loss_mlp": 1.03145528, "epoch": 0.12896437697279423, "flos": 22565870835840.0, "grad_norm": 1.8026346290528672, "language_loss": 0.78304374, "learning_rate": 3.898139042173813e-06, "loss": 0.80488217, "num_input_tokens_seen": 46368600, "step": 2145, "time_per_iteration": 2.6766605377197266 }, { "auxiliary_loss_clip": 0.01170585, "auxiliary_loss_mlp": 0.01047893, "balance_loss_clip": 1.0543592, "balance_loss_mlp": 1.02662635, "epoch": 0.1290245002254622, "flos": 17493776075520.0, "grad_norm": 2.147087506474235, "language_loss": 0.82865375, "learning_rate": 3.898016299767465e-06, "loss": 0.85083848, "num_input_tokens_seen": 46387370, "step": 2146, "time_per_iteration": 2.5860395431518555 }, { "auxiliary_loss_clip": 0.01141916, "auxiliary_loss_mlp": 0.0105138, "balance_loss_clip": 1.05367482, "balance_loss_mlp": 1.03062606, "epoch": 0.12908462347813016, "flos": 36315957859200.0, "grad_norm": 2.344626501147968, "language_loss": 0.71275079, "learning_rate": 3.897893485388149e-06, "loss": 0.73468375, "num_input_tokens_seen": 46409570, "step": 2147, "time_per_iteration": 2.7870359420776367 }, { "auxiliary_loss_clip": 0.01147238, "auxiliary_loss_mlp": 0.01052291, "balance_loss_clip": 1.05527067, "balance_loss_mlp": 1.03297925, "epoch": 0.12914474673079815, "flos": 22528703237760.0, "grad_norm": 2.120275205230366, "language_loss": 0.71432978, "learning_rate": 3.897770599040521e-06, "loss": 0.73632509, "num_input_tokens_seen": 46429320, "step": 2148, "time_per_iteration": 2.6865081787109375 }, { "auxiliary_loss_clip": 0.01168479, "auxiliary_loss_mlp": 0.01049575, "balance_loss_clip": 1.05762172, "balance_loss_mlp": 1.03016782, "epoch": 0.12920486998346611, "flos": 21471888263040.0, "grad_norm": 1.6388902851592406, "language_loss": 0.79064089, "learning_rate": 3.897647640729242e-06, "loss": 0.81282145, "num_input_tokens_seen": 46450155, "step": 2149, "time_per_iteration": 2.6041862964630127 }, { "auxiliary_loss_clip": 0.01159527, "auxiliary_loss_mlp": 0.01046069, "balance_loss_clip": 1.05377793, "balance_loss_mlp": 1.02531469, "epoch": 0.12926499323613408, "flos": 27308556944640.0, "grad_norm": 2.034796374339078, "language_loss": 0.75976646, "learning_rate": 3.897524610458975e-06, "loss": 0.78182244, "num_input_tokens_seen": 46470280, "step": 2150, "time_per_iteration": 2.647224187850952 }, { "auxiliary_loss_clip": 0.01155787, "auxiliary_loss_mlp": 0.01055192, "balance_loss_clip": 1.05445433, "balance_loss_mlp": 1.03491461, "epoch": 0.12932511648880204, "flos": 22091131756800.0, "grad_norm": 2.3830500835005592, "language_loss": 0.70986372, "learning_rate": 3.8974015082343835e-06, "loss": 0.73197353, "num_input_tokens_seen": 46487605, "step": 2151, "time_per_iteration": 2.7008492946624756 }, { "auxiliary_loss_clip": 0.01167835, "auxiliary_loss_mlp": 0.0104951, "balance_loss_clip": 1.05603719, "balance_loss_mlp": 1.03017378, "epoch": 0.12938523974147, "flos": 20302780394880.0, "grad_norm": 2.058334480733051, "language_loss": 0.83964819, "learning_rate": 3.897278334060137e-06, "loss": 0.86182165, "num_input_tokens_seen": 46505100, "step": 2152, "time_per_iteration": 2.6467373371124268 }, { "auxiliary_loss_clip": 0.01158553, "auxiliary_loss_mlp": 0.01058416, "balance_loss_clip": 1.05283821, "balance_loss_mlp": 1.03888893, "epoch": 0.12944536299413797, "flos": 19499961467520.0, "grad_norm": 1.5624811365269535, "language_loss": 0.78585124, "learning_rate": 3.897155087940906e-06, "loss": 0.80802095, "num_input_tokens_seen": 46524020, "step": 2153, "time_per_iteration": 4.286921262741089 }, { "auxiliary_loss_clip": 0.01113716, "auxiliary_loss_mlp": 0.00777812, "balance_loss_clip": 1.04707122, "balance_loss_mlp": 0.99989671, "epoch": 0.12950548624680594, "flos": 27707919333120.0, "grad_norm": 1.6189787343362376, "language_loss": 0.80253434, "learning_rate": 3.897031769881364e-06, "loss": 0.82144964, "num_input_tokens_seen": 46544640, "step": 2154, "time_per_iteration": 2.7602338790893555 }, { "auxiliary_loss_clip": 0.01149958, "auxiliary_loss_mlp": 0.0105188, "balance_loss_clip": 1.05262971, "balance_loss_mlp": 1.03099442, "epoch": 0.12956560949947393, "flos": 17565740974080.0, "grad_norm": 1.8080432584650143, "language_loss": 0.83717728, "learning_rate": 3.896908379886188e-06, "loss": 0.85919571, "num_input_tokens_seen": 46561395, "step": 2155, "time_per_iteration": 5.696707010269165 }, { "auxiliary_loss_clip": 0.01161999, "auxiliary_loss_mlp": 0.01056273, "balance_loss_clip": 1.05426383, "balance_loss_mlp": 1.03611445, "epoch": 0.1296257327521419, "flos": 20740711011840.0, "grad_norm": 2.4972858828122666, "language_loss": 0.76114857, "learning_rate": 3.896784917960055e-06, "loss": 0.78333133, "num_input_tokens_seen": 46579395, "step": 2156, "time_per_iteration": 2.6279313564300537 }, { "auxiliary_loss_clip": 0.01105089, "auxiliary_loss_mlp": 0.01056603, "balance_loss_clip": 1.0510118, "balance_loss_mlp": 1.03679013, "epoch": 0.12968585600480986, "flos": 16395735265920.0, "grad_norm": 1.6652476704410177, "language_loss": 0.86493659, "learning_rate": 3.896661384107648e-06, "loss": 0.88655347, "num_input_tokens_seen": 46597090, "step": 2157, "time_per_iteration": 4.4089202880859375 }, { "auxiliary_loss_clip": 0.01170107, "auxiliary_loss_mlp": 0.01055814, "balance_loss_clip": 1.05253935, "balance_loss_mlp": 1.0349642, "epoch": 0.12974597925747783, "flos": 28329533124480.0, "grad_norm": 2.5240136552338956, "language_loss": 0.80393612, "learning_rate": 3.896537778333651e-06, "loss": 0.8261953, "num_input_tokens_seen": 46617355, "step": 2158, "time_per_iteration": 2.702765703201294 }, { "auxiliary_loss_clip": 0.01177017, "auxiliary_loss_mlp": 0.01060365, "balance_loss_clip": 1.05905974, "balance_loss_mlp": 1.04050517, "epoch": 0.1298061025101458, "flos": 9683025782400.0, "grad_norm": 2.5307604694159607, "language_loss": 0.74881256, "learning_rate": 3.896414100642752e-06, "loss": 0.77118635, "num_input_tokens_seen": 46633130, "step": 2159, "time_per_iteration": 2.534163475036621 }, { "auxiliary_loss_clip": 0.01122909, "auxiliary_loss_mlp": 0.01058309, "balance_loss_clip": 1.04594469, "balance_loss_mlp": 1.03471708, "epoch": 0.12986622576281376, "flos": 27709535445120.0, "grad_norm": 1.954419432637739, "language_loss": 0.8259204, "learning_rate": 3.89629035103964e-06, "loss": 0.84773254, "num_input_tokens_seen": 46650575, "step": 2160, "time_per_iteration": 2.7358646392822266 }, { "auxiliary_loss_clip": 0.01154348, "auxiliary_loss_mlp": 0.01047243, "balance_loss_clip": 1.05873609, "balance_loss_mlp": 1.02732301, "epoch": 0.12992634901548175, "flos": 18802719590400.0, "grad_norm": 1.7252123805741888, "language_loss": 0.82310414, "learning_rate": 3.896166529529008e-06, "loss": 0.84512007, "num_input_tokens_seen": 46668780, "step": 2161, "time_per_iteration": 2.7029623985290527 }, { "auxiliary_loss_clip": 0.01145886, "auxiliary_loss_mlp": 0.01060381, "balance_loss_clip": 1.05145073, "balance_loss_mlp": 1.03911448, "epoch": 0.12998647226814972, "flos": 29127575543040.0, "grad_norm": 2.0780374068601253, "language_loss": 0.82668459, "learning_rate": 3.896042636115551e-06, "loss": 0.84874725, "num_input_tokens_seen": 46687550, "step": 2162, "time_per_iteration": 2.674825668334961 }, { "auxiliary_loss_clip": 0.0113921, "auxiliary_loss_mlp": 0.0105953, "balance_loss_clip": 1.05468941, "balance_loss_mlp": 1.03957474, "epoch": 0.13004659552081768, "flos": 19573686132480.0, "grad_norm": 3.928222506771022, "language_loss": 0.72579277, "learning_rate": 3.895918670803968e-06, "loss": 0.7477802, "num_input_tokens_seen": 46706730, "step": 2163, "time_per_iteration": 2.678394079208374 }, { "auxiliary_loss_clip": 0.01173873, "auxiliary_loss_mlp": 0.00778662, "balance_loss_clip": 1.05635965, "balance_loss_mlp": 0.99994016, "epoch": 0.13010671877348565, "flos": 22490709626880.0, "grad_norm": 2.0196348424542827, "language_loss": 0.81330699, "learning_rate": 3.895794633598958e-06, "loss": 0.83283234, "num_input_tokens_seen": 46724250, "step": 2164, "time_per_iteration": 2.6116931438446045 }, { "auxiliary_loss_clip": 0.01119834, "auxiliary_loss_mlp": 0.01050661, "balance_loss_clip": 1.04808033, "balance_loss_mlp": 1.03061032, "epoch": 0.1301668420261536, "flos": 23878226142720.0, "grad_norm": 2.274563635903502, "language_loss": 0.72262049, "learning_rate": 3.8956705245052256e-06, "loss": 0.74432552, "num_input_tokens_seen": 46744105, "step": 2165, "time_per_iteration": 2.7646515369415283 }, { "auxiliary_loss_clip": 0.01109832, "auxiliary_loss_mlp": 0.01048351, "balance_loss_clip": 1.05059505, "balance_loss_mlp": 1.02707219, "epoch": 0.13022696527882158, "flos": 23150065633920.0, "grad_norm": 2.8383873988269217, "language_loss": 0.74749964, "learning_rate": 3.8955463435274765e-06, "loss": 0.76908153, "num_input_tokens_seen": 46764250, "step": 2166, "time_per_iteration": 2.7939398288726807 }, { "auxiliary_loss_clip": 0.01170298, "auxiliary_loss_mlp": 0.01048037, "balance_loss_clip": 1.05364752, "balance_loss_mlp": 1.02827251, "epoch": 0.13028708853148954, "flos": 26908548111360.0, "grad_norm": 1.5379857106114436, "language_loss": 0.83098066, "learning_rate": 3.895422090670421e-06, "loss": 0.85316396, "num_input_tokens_seen": 46786865, "step": 2167, "time_per_iteration": 2.700505495071411 }, { "auxiliary_loss_clip": 0.01108628, "auxiliary_loss_mlp": 0.01059921, "balance_loss_clip": 1.04567361, "balance_loss_mlp": 1.03841531, "epoch": 0.13034721178415754, "flos": 21251468453760.0, "grad_norm": 1.6054044551173634, "language_loss": 0.83578718, "learning_rate": 3.89529776593877e-06, "loss": 0.85747266, "num_input_tokens_seen": 46807030, "step": 2168, "time_per_iteration": 2.839285135269165 }, { "auxiliary_loss_clip": 0.01079188, "auxiliary_loss_mlp": 0.01063413, "balance_loss_clip": 1.04247975, "balance_loss_mlp": 1.03861713, "epoch": 0.1304073350368255, "flos": 18767239931520.0, "grad_norm": 1.950315007602454, "language_loss": 0.79910588, "learning_rate": 3.8951733693372375e-06, "loss": 0.8205319, "num_input_tokens_seen": 46826280, "step": 2169, "time_per_iteration": 2.8150076866149902 }, { "auxiliary_loss_clip": 0.01174566, "auxiliary_loss_mlp": 0.01044893, "balance_loss_clip": 1.05822575, "balance_loss_mlp": 1.02339983, "epoch": 0.13046745828949347, "flos": 28364653647360.0, "grad_norm": 2.4117618540057766, "language_loss": 0.66804767, "learning_rate": 3.8950489008705406e-06, "loss": 0.69024229, "num_input_tokens_seen": 46846505, "step": 2170, "time_per_iteration": 2.722769021987915 }, { "auxiliary_loss_clip": 0.0114216, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.05424142, "balance_loss_mlp": 1.02637053, "epoch": 0.13052758154216143, "flos": 29605044055680.0, "grad_norm": 1.9089846415842238, "language_loss": 0.66768706, "learning_rate": 3.8949243605434e-06, "loss": 0.68957549, "num_input_tokens_seen": 46867380, "step": 2171, "time_per_iteration": 2.7474682331085205 }, { "auxiliary_loss_clip": 0.01157431, "auxiliary_loss_mlp": 0.01049079, "balance_loss_clip": 1.05283058, "balance_loss_mlp": 1.02701378, "epoch": 0.1305877047948294, "flos": 19390864884480.0, "grad_norm": 2.103440896006443, "language_loss": 0.72157478, "learning_rate": 3.894799748360537e-06, "loss": 0.74363995, "num_input_tokens_seen": 46886810, "step": 2172, "time_per_iteration": 2.8062691688537598 }, { "auxiliary_loss_clip": 0.01131178, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.05676126, "balance_loss_mlp": 1.0248909, "epoch": 0.13064782804749736, "flos": 16873527000960.0, "grad_norm": 1.8662964619330822, "language_loss": 0.75331408, "learning_rate": 3.894675064326678e-06, "loss": 0.77508402, "num_input_tokens_seen": 46905620, "step": 2173, "time_per_iteration": 2.749630928039551 }, { "auxiliary_loss_clip": 0.01132129, "auxiliary_loss_mlp": 0.01056024, "balance_loss_clip": 1.05241716, "balance_loss_mlp": 1.03388715, "epoch": 0.13070795130016533, "flos": 24499085748480.0, "grad_norm": 2.8034072456055426, "language_loss": 0.70175481, "learning_rate": 3.894550308446551e-06, "loss": 0.72363639, "num_input_tokens_seen": 46925120, "step": 2174, "time_per_iteration": 2.723314046859741 }, { "auxiliary_loss_clip": 0.01047643, "auxiliary_loss_mlp": 0.01015006, "balance_loss_clip": 1.02629197, "balance_loss_mlp": 1.01260972, "epoch": 0.13076807455283332, "flos": 71054505953280.0, "grad_norm": 0.7998489021914615, "language_loss": 0.59026134, "learning_rate": 3.894425480724886e-06, "loss": 0.61088777, "num_input_tokens_seen": 46988195, "step": 2175, "time_per_iteration": 3.318049192428589 }, { "auxiliary_loss_clip": 0.01159762, "auxiliary_loss_mlp": 0.01053929, "balance_loss_clip": 1.05441868, "balance_loss_mlp": 1.03342521, "epoch": 0.13082819780550128, "flos": 20264499475200.0, "grad_norm": 2.2309284705459707, "language_loss": 0.80365628, "learning_rate": 3.894300581166417e-06, "loss": 0.82579315, "num_input_tokens_seen": 47004720, "step": 2176, "time_per_iteration": 2.631732702255249 }, { "auxiliary_loss_clip": 0.01169648, "auxiliary_loss_mlp": 0.01047517, "balance_loss_clip": 1.05513525, "balance_loss_mlp": 1.02529645, "epoch": 0.13088832105816925, "flos": 34203441231360.0, "grad_norm": 1.6906214681317566, "language_loss": 0.74661696, "learning_rate": 3.894175609775881e-06, "loss": 0.76878858, "num_input_tokens_seen": 47024255, "step": 2177, "time_per_iteration": 2.701422691345215 }, { "auxiliary_loss_clip": 0.01131124, "auxiliary_loss_mlp": 0.0105144, "balance_loss_clip": 1.051373, "balance_loss_mlp": 1.02905297, "epoch": 0.13094844431083721, "flos": 17894970057600.0, "grad_norm": 1.8043513019060269, "language_loss": 0.82266748, "learning_rate": 3.894050566558015e-06, "loss": 0.84449303, "num_input_tokens_seen": 47042465, "step": 2178, "time_per_iteration": 2.6934497356414795 }, { "auxiliary_loss_clip": 0.01170524, "auxiliary_loss_mlp": 0.01047895, "balance_loss_clip": 1.05729508, "balance_loss_mlp": 1.02705729, "epoch": 0.13100856756350518, "flos": 17311313963520.0, "grad_norm": 2.9251611149508276, "language_loss": 0.74291968, "learning_rate": 3.893925451517562e-06, "loss": 0.76510382, "num_input_tokens_seen": 47060370, "step": 2179, "time_per_iteration": 2.6111502647399902 }, { "auxiliary_loss_clip": 0.01128297, "auxiliary_loss_mlp": 0.01052407, "balance_loss_clip": 1.04917574, "balance_loss_mlp": 1.03184354, "epoch": 0.13106869081617314, "flos": 22200551562240.0, "grad_norm": 1.9805514150688242, "language_loss": 0.84366202, "learning_rate": 3.893800264659266e-06, "loss": 0.8654691, "num_input_tokens_seen": 47081415, "step": 2180, "time_per_iteration": 2.731229543685913 }, { "auxiliary_loss_clip": 0.01162028, "auxiliary_loss_mlp": 0.0105845, "balance_loss_clip": 1.05875921, "balance_loss_mlp": 1.03757644, "epoch": 0.13112881406884114, "flos": 21763123735680.0, "grad_norm": 1.8389866248015785, "language_loss": 0.89840436, "learning_rate": 3.8936750059878746e-06, "loss": 0.92060918, "num_input_tokens_seen": 47099860, "step": 2181, "time_per_iteration": 2.643890380859375 }, { "auxiliary_loss_clip": 0.01153771, "auxiliary_loss_mlp": 0.01051982, "balance_loss_clip": 1.05222976, "balance_loss_mlp": 1.03126323, "epoch": 0.1311889373215091, "flos": 23331091201920.0, "grad_norm": 2.117586475019142, "language_loss": 0.68813586, "learning_rate": 3.893549675508137e-06, "loss": 0.7101934, "num_input_tokens_seen": 47118540, "step": 2182, "time_per_iteration": 2.6198863983154297 }, { "auxiliary_loss_clip": 0.01123039, "auxiliary_loss_mlp": 0.01051411, "balance_loss_clip": 1.0502702, "balance_loss_mlp": 1.0292381, "epoch": 0.13124906057417707, "flos": 21467363149440.0, "grad_norm": 1.787500136217105, "language_loss": 0.78694725, "learning_rate": 3.893424273224806e-06, "loss": 0.8086918, "num_input_tokens_seen": 47136710, "step": 2183, "time_per_iteration": 2.715517520904541 }, { "auxiliary_loss_clip": 0.01169106, "auxiliary_loss_mlp": 0.01047098, "balance_loss_clip": 1.05452895, "balance_loss_mlp": 1.02586675, "epoch": 0.13130918382684503, "flos": 23255319461760.0, "grad_norm": 26.753588494231124, "language_loss": 0.85792655, "learning_rate": 3.893298799142636e-06, "loss": 0.88008863, "num_input_tokens_seen": 47157155, "step": 2184, "time_per_iteration": 2.632539987564087 }, { "auxiliary_loss_clip": 0.01138714, "auxiliary_loss_mlp": 0.01054657, "balance_loss_clip": 1.05349112, "balance_loss_mlp": 1.03230524, "epoch": 0.131369307079513, "flos": 20850274471680.0, "grad_norm": 2.50466124454056, "language_loss": 0.82703435, "learning_rate": 3.893173253266387e-06, "loss": 0.84896809, "num_input_tokens_seen": 47176820, "step": 2185, "time_per_iteration": 2.6809136867523193 }, { "auxiliary_loss_clip": 0.01144077, "auxiliary_loss_mlp": 0.01054121, "balance_loss_clip": 1.05262399, "balance_loss_mlp": 1.03236496, "epoch": 0.13142943033218096, "flos": 17858341163520.0, "grad_norm": 1.8949462712827352, "language_loss": 0.72956109, "learning_rate": 3.893047635600818e-06, "loss": 0.75154305, "num_input_tokens_seen": 47195855, "step": 2186, "time_per_iteration": 2.628096342086792 }, { "auxiliary_loss_clip": 0.01157778, "auxiliary_loss_mlp": 0.01050695, "balance_loss_clip": 1.05436552, "balance_loss_mlp": 1.02783096, "epoch": 0.13148955358484893, "flos": 20996035862400.0, "grad_norm": 1.9822444068613732, "language_loss": 0.80363685, "learning_rate": 3.892921946150693e-06, "loss": 0.82572162, "num_input_tokens_seen": 47214535, "step": 2187, "time_per_iteration": 2.762223720550537 }, { "auxiliary_loss_clip": 0.01027324, "auxiliary_loss_mlp": 0.0101023, "balance_loss_clip": 1.02364707, "balance_loss_mlp": 1.00792885, "epoch": 0.13154967683751692, "flos": 70172467580160.0, "grad_norm": 0.8471850380496847, "language_loss": 0.59082437, "learning_rate": 3.892796184920778e-06, "loss": 0.61119986, "num_input_tokens_seen": 47270300, "step": 2188, "time_per_iteration": 3.302457571029663 }, { "auxiliary_loss_clip": 0.01095126, "auxiliary_loss_mlp": 0.01059346, "balance_loss_clip": 1.04827487, "balance_loss_mlp": 1.03676724, "epoch": 0.1316098000901849, "flos": 20376145923840.0, "grad_norm": 1.7340345041340466, "language_loss": 0.74211109, "learning_rate": 3.892670351915842e-06, "loss": 0.76365584, "num_input_tokens_seen": 47290720, "step": 2189, "time_per_iteration": 2.7990496158599854 }, { "auxiliary_loss_clip": 0.01160124, "auxiliary_loss_mlp": 0.01049098, "balance_loss_clip": 1.05551052, "balance_loss_mlp": 1.02799821, "epoch": 0.13166992334285285, "flos": 23221132692480.0, "grad_norm": 1.8160574809616576, "language_loss": 0.73152113, "learning_rate": 3.892544447140657e-06, "loss": 0.75361335, "num_input_tokens_seen": 47311820, "step": 2190, "time_per_iteration": 2.6485326290130615 }, { "auxiliary_loss_clip": 0.01160351, "auxiliary_loss_mlp": 0.01058461, "balance_loss_clip": 1.05671644, "balance_loss_mlp": 1.03811169, "epoch": 0.13173004659552082, "flos": 23330947547520.0, "grad_norm": 1.8825588242208007, "language_loss": 0.74617779, "learning_rate": 3.892418470599996e-06, "loss": 0.76836598, "num_input_tokens_seen": 47331605, "step": 2191, "time_per_iteration": 2.644484281539917 }, { "auxiliary_loss_clip": 0.0112783, "auxiliary_loss_mlp": 0.01054712, "balance_loss_clip": 1.05129039, "balance_loss_mlp": 1.03356445, "epoch": 0.13179016984818878, "flos": 21251504367360.0, "grad_norm": 1.8823393822145031, "language_loss": 0.79093283, "learning_rate": 3.892292422298637e-06, "loss": 0.81275827, "num_input_tokens_seen": 47350455, "step": 2192, "time_per_iteration": 2.735225200653076 }, { "auxiliary_loss_clip": 0.0111282, "auxiliary_loss_mlp": 0.01051113, "balance_loss_clip": 1.04457211, "balance_loss_mlp": 1.02936912, "epoch": 0.13185029310085675, "flos": 17778690754560.0, "grad_norm": 1.7242105632860862, "language_loss": 0.85350716, "learning_rate": 3.892166302241361e-06, "loss": 0.87514639, "num_input_tokens_seen": 47368225, "step": 2193, "time_per_iteration": 4.262877941131592 }, { "auxiliary_loss_clip": 0.0104173, "auxiliary_loss_mlp": 0.01015651, "balance_loss_clip": 1.02609122, "balance_loss_mlp": 1.01280212, "epoch": 0.1319104163535247, "flos": 69851785933440.0, "grad_norm": 0.7746813180799224, "language_loss": 0.54112649, "learning_rate": 3.8920401104329475e-06, "loss": 0.56170022, "num_input_tokens_seen": 47427125, "step": 2194, "time_per_iteration": 6.223008394241333 }, { "auxiliary_loss_clip": 0.01168022, "auxiliary_loss_mlp": 0.01048581, "balance_loss_clip": 1.05420566, "balance_loss_mlp": 1.02828002, "epoch": 0.1319705396061927, "flos": 25193095401600.0, "grad_norm": 2.1079865649821925, "language_loss": 0.72433972, "learning_rate": 3.891913846878185e-06, "loss": 0.74650574, "num_input_tokens_seen": 47450275, "step": 2195, "time_per_iteration": 2.6357345581054688 }, { "auxiliary_loss_clip": 0.01136503, "auxiliary_loss_mlp": 0.00778731, "balance_loss_clip": 1.05176425, "balance_loss_mlp": 0.99996454, "epoch": 0.13203066285886067, "flos": 20740459616640.0, "grad_norm": 1.5737174748369949, "language_loss": 0.78126895, "learning_rate": 3.891787511581859e-06, "loss": 0.8004213, "num_input_tokens_seen": 47469155, "step": 2196, "time_per_iteration": 2.7118594646453857 }, { "auxiliary_loss_clip": 0.01162447, "auxiliary_loss_mlp": 0.010526, "balance_loss_clip": 1.05453539, "balance_loss_mlp": 1.03210831, "epoch": 0.13209078611152864, "flos": 22054395121920.0, "grad_norm": 1.9385650447291836, "language_loss": 0.74632496, "learning_rate": 3.89166110454876e-06, "loss": 0.76847541, "num_input_tokens_seen": 47488405, "step": 2197, "time_per_iteration": 4.270530939102173 }, { "auxiliary_loss_clip": 0.01173786, "auxiliary_loss_mlp": 0.01050846, "balance_loss_clip": 1.05440533, "balance_loss_mlp": 1.02947164, "epoch": 0.1321509093641966, "flos": 16284950743680.0, "grad_norm": 1.785688190112577, "language_loss": 0.79566747, "learning_rate": 3.891534625783685e-06, "loss": 0.81791383, "num_input_tokens_seen": 47505650, "step": 2198, "time_per_iteration": 2.6145474910736084 }, { "auxiliary_loss_clip": 0.01170264, "auxiliary_loss_mlp": 0.01057159, "balance_loss_clip": 1.05536175, "balance_loss_mlp": 1.03647637, "epoch": 0.13221103261686457, "flos": 16983018633600.0, "grad_norm": 2.56313218775589, "language_loss": 0.82932216, "learning_rate": 3.891408075291425e-06, "loss": 0.85159647, "num_input_tokens_seen": 47521540, "step": 2199, "time_per_iteration": 2.5715503692626953 }, { "auxiliary_loss_clip": 0.01122554, "auxiliary_loss_mlp": 0.01052148, "balance_loss_clip": 1.05047798, "balance_loss_mlp": 1.03045249, "epoch": 0.13227115586953253, "flos": 34233605677440.0, "grad_norm": 1.8710902505917797, "language_loss": 0.69579422, "learning_rate": 3.8912814530767826e-06, "loss": 0.71754128, "num_input_tokens_seen": 47543625, "step": 2200, "time_per_iteration": 2.8001365661621094 }, { "auxiliary_loss_clip": 0.01167798, "auxiliary_loss_mlp": 0.01058155, "balance_loss_clip": 1.05345917, "balance_loss_mlp": 1.03618431, "epoch": 0.13233127912220052, "flos": 20704656735360.0, "grad_norm": 1.647659287704997, "language_loss": 0.84624702, "learning_rate": 3.891154759144557e-06, "loss": 0.86850655, "num_input_tokens_seen": 47563740, "step": 2201, "time_per_iteration": 2.6485981941223145 }, { "auxiliary_loss_clip": 0.0117188, "auxiliary_loss_mlp": 0.01055627, "balance_loss_clip": 1.05427861, "balance_loss_mlp": 1.03431273, "epoch": 0.1323914023748685, "flos": 25805048434560.0, "grad_norm": 1.7446392584198542, "language_loss": 0.87088037, "learning_rate": 3.891027993499554e-06, "loss": 0.8931554, "num_input_tokens_seen": 47582655, "step": 2202, "time_per_iteration": 2.5921456813812256 }, { "auxiliary_loss_clip": 0.01139991, "auxiliary_loss_mlp": 0.01053413, "balance_loss_clip": 1.05299544, "balance_loss_mlp": 1.03267026, "epoch": 0.13245152562753645, "flos": 21251540280960.0, "grad_norm": 2.405254380671628, "language_loss": 0.72801507, "learning_rate": 3.89090115614658e-06, "loss": 0.7499491, "num_input_tokens_seen": 47600875, "step": 2203, "time_per_iteration": 2.6257405281066895 }, { "auxiliary_loss_clip": 0.01124508, "auxiliary_loss_mlp": 0.0105959, "balance_loss_clip": 1.05080879, "balance_loss_mlp": 1.03916979, "epoch": 0.13251164888020442, "flos": 26610955931520.0, "grad_norm": 2.044348475010678, "language_loss": 0.73170948, "learning_rate": 3.890774247090444e-06, "loss": 0.75355047, "num_input_tokens_seen": 47619250, "step": 2204, "time_per_iteration": 2.753830909729004 }, { "auxiliary_loss_clip": 0.01160826, "auxiliary_loss_mlp": 0.01054406, "balance_loss_clip": 1.05474758, "balance_loss_mlp": 1.03225708, "epoch": 0.13257177213287238, "flos": 29826541272960.0, "grad_norm": 2.094172729236468, "language_loss": 0.78377104, "learning_rate": 3.89064726633596e-06, "loss": 0.80592328, "num_input_tokens_seen": 47639445, "step": 2205, "time_per_iteration": 2.730682134628296 }, { "auxiliary_loss_clip": 0.01125154, "auxiliary_loss_mlp": 0.01048818, "balance_loss_clip": 1.04975629, "balance_loss_mlp": 1.02782559, "epoch": 0.13263189538554035, "flos": 21288456483840.0, "grad_norm": 1.8609089802832188, "language_loss": 0.78638101, "learning_rate": 3.890520213887941e-06, "loss": 0.80812073, "num_input_tokens_seen": 47658740, "step": 2206, "time_per_iteration": 2.691962718963623 }, { "auxiliary_loss_clip": 0.01124965, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.04958403, "balance_loss_mlp": 1.02649069, "epoch": 0.13269201863820831, "flos": 16874101618560.0, "grad_norm": 2.2777192787220066, "language_loss": 0.74672282, "learning_rate": 3.890393089751208e-06, "loss": 0.76843208, "num_input_tokens_seen": 47676880, "step": 2207, "time_per_iteration": 2.7062454223632812 }, { "auxiliary_loss_clip": 0.01143208, "auxiliary_loss_mlp": 0.01047941, "balance_loss_clip": 1.05257845, "balance_loss_mlp": 1.02672219, "epoch": 0.1327521418908763, "flos": 23768914078080.0, "grad_norm": 1.692212064021935, "language_loss": 0.84061795, "learning_rate": 3.890265893930578e-06, "loss": 0.8625294, "num_input_tokens_seen": 47696635, "step": 2208, "time_per_iteration": 2.687717914581299 }, { "auxiliary_loss_clip": 0.01152573, "auxiliary_loss_mlp": 0.0105274, "balance_loss_clip": 1.05847478, "balance_loss_mlp": 1.03411973, "epoch": 0.13281226514354427, "flos": 26505594362880.0, "grad_norm": 1.7032258459750478, "language_loss": 0.85587811, "learning_rate": 3.890138626430876e-06, "loss": 0.8779313, "num_input_tokens_seen": 47717760, "step": 2209, "time_per_iteration": 2.646015167236328 }, { "auxiliary_loss_clip": 0.01138084, "auxiliary_loss_mlp": 0.00778828, "balance_loss_clip": 1.05316806, "balance_loss_mlp": 1.00002563, "epoch": 0.13287238839621224, "flos": 24498762526080.0, "grad_norm": 2.237247968175465, "language_loss": 0.81797457, "learning_rate": 3.890011287256929e-06, "loss": 0.83714366, "num_input_tokens_seen": 47737685, "step": 2210, "time_per_iteration": 2.676262378692627 }, { "auxiliary_loss_clip": 0.0104445, "auxiliary_loss_mlp": 0.00757817, "balance_loss_clip": 1.03801322, "balance_loss_mlp": 1.00007725, "epoch": 0.1329325116488802, "flos": 67694344369920.0, "grad_norm": 0.7515252652740232, "language_loss": 0.58031559, "learning_rate": 3.889883876413563e-06, "loss": 0.59833825, "num_input_tokens_seen": 47802415, "step": 2211, "time_per_iteration": 3.3914146423339844 }, { "auxiliary_loss_clip": 0.01064712, "auxiliary_loss_mlp": 0.01012978, "balance_loss_clip": 1.04205871, "balance_loss_mlp": 1.01083231, "epoch": 0.13299263490154817, "flos": 72261894741120.0, "grad_norm": 0.8012428422082742, "language_loss": 0.55299425, "learning_rate": 3.889756393905611e-06, "loss": 0.57377112, "num_input_tokens_seen": 47871485, "step": 2212, "time_per_iteration": 3.2910914421081543 }, { "auxiliary_loss_clip": 0.01132433, "auxiliary_loss_mlp": 0.01054299, "balance_loss_clip": 1.05107963, "balance_loss_mlp": 1.0331986, "epoch": 0.13305275815421613, "flos": 17931275729280.0, "grad_norm": 2.484635795733661, "language_loss": 0.74228692, "learning_rate": 3.889628839737908e-06, "loss": 0.7641542, "num_input_tokens_seen": 47888315, "step": 2213, "time_per_iteration": 2.755777597427368 }, { "auxiliary_loss_clip": 0.01114671, "auxiliary_loss_mlp": 0.01051459, "balance_loss_clip": 1.04682255, "balance_loss_mlp": 1.03231359, "epoch": 0.13311288140688413, "flos": 22340889999360.0, "grad_norm": 1.850943077435394, "language_loss": 0.79699469, "learning_rate": 3.889501213915291e-06, "loss": 0.81865597, "num_input_tokens_seen": 47906600, "step": 2214, "time_per_iteration": 2.702603340148926 }, { "auxiliary_loss_clip": 0.01143494, "auxiliary_loss_mlp": 0.01052411, "balance_loss_clip": 1.05555344, "balance_loss_mlp": 1.03171659, "epoch": 0.1331730046595521, "flos": 31868888682240.0, "grad_norm": 1.8782588426913054, "language_loss": 0.69341159, "learning_rate": 3.889373516442597e-06, "loss": 0.71537066, "num_input_tokens_seen": 47927630, "step": 2215, "time_per_iteration": 2.769237518310547 }, { "auxiliary_loss_clip": 0.01167307, "auxiliary_loss_mlp": 0.01051423, "balance_loss_clip": 1.06098068, "balance_loss_mlp": 1.03132463, "epoch": 0.13323312791222006, "flos": 22566589107840.0, "grad_norm": 1.884566493826098, "language_loss": 0.81262428, "learning_rate": 3.889245747324671e-06, "loss": 0.83481157, "num_input_tokens_seen": 47947935, "step": 2216, "time_per_iteration": 2.7427120208740234 }, { "auxiliary_loss_clip": 0.01163681, "auxiliary_loss_mlp": 0.01056545, "balance_loss_clip": 1.06198788, "balance_loss_mlp": 1.03631544, "epoch": 0.13329325116488802, "flos": 15085319293440.0, "grad_norm": 3.783334161704178, "language_loss": 0.87299347, "learning_rate": 3.889117906566356e-06, "loss": 0.89519572, "num_input_tokens_seen": 47965515, "step": 2217, "time_per_iteration": 2.709527015686035 }, { "auxiliary_loss_clip": 0.01152703, "auxiliary_loss_mlp": 0.01056364, "balance_loss_clip": 1.06054497, "balance_loss_mlp": 1.0343225, "epoch": 0.133353374417556, "flos": 27453671890560.0, "grad_norm": 4.412823416345162, "language_loss": 0.73105222, "learning_rate": 3.888989994172501e-06, "loss": 0.75314289, "num_input_tokens_seen": 47985675, "step": 2218, "time_per_iteration": 2.697733163833618 }, { "auxiliary_loss_clip": 0.01129106, "auxiliary_loss_mlp": 0.01051151, "balance_loss_clip": 1.0535965, "balance_loss_mlp": 1.02993202, "epoch": 0.13341349767022395, "flos": 24094695456000.0, "grad_norm": 1.7935349411013712, "language_loss": 0.86911142, "learning_rate": 3.8888620101479565e-06, "loss": 0.89091408, "num_input_tokens_seen": 48004985, "step": 2219, "time_per_iteration": 2.7641642093658447 }, { "auxiliary_loss_clip": 0.01141172, "auxiliary_loss_mlp": 0.0106326, "balance_loss_clip": 1.05751657, "balance_loss_mlp": 1.04406714, "epoch": 0.13347362092289192, "flos": 24133335511680.0, "grad_norm": 1.8604531362737113, "language_loss": 0.77244747, "learning_rate": 3.888733954497574e-06, "loss": 0.79449183, "num_input_tokens_seen": 48024965, "step": 2220, "time_per_iteration": 2.732160806655884 }, { "auxiliary_loss_clip": 0.01146487, "auxiliary_loss_mlp": 0.01048662, "balance_loss_clip": 1.05399704, "balance_loss_mlp": 1.03001785, "epoch": 0.1335337441755599, "flos": 18436538390400.0, "grad_norm": 2.3004113327688955, "language_loss": 0.79467338, "learning_rate": 3.888605827226212e-06, "loss": 0.81662482, "num_input_tokens_seen": 48040890, "step": 2221, "time_per_iteration": 2.685612440109253 }, { "auxiliary_loss_clip": 0.01062777, "auxiliary_loss_mlp": 0.01021711, "balance_loss_clip": 1.03293467, "balance_loss_mlp": 1.0194701, "epoch": 0.13359386742822787, "flos": 50611997652480.0, "grad_norm": 0.9755051104211709, "language_loss": 0.68938822, "learning_rate": 3.8884776283387275e-06, "loss": 0.71023309, "num_input_tokens_seen": 48091855, "step": 2222, "time_per_iteration": 3.0336835384368896 }, { "auxiliary_loss_clip": 0.01130152, "auxiliary_loss_mlp": 0.01058574, "balance_loss_clip": 1.05544209, "balance_loss_mlp": 1.03940475, "epoch": 0.13365399068089584, "flos": 22778569221120.0, "grad_norm": 2.1295993667823416, "language_loss": 0.67389107, "learning_rate": 3.888349357839982e-06, "loss": 0.69577825, "num_input_tokens_seen": 48111350, "step": 2223, "time_per_iteration": 2.7134146690368652 }, { "auxiliary_loss_clip": 0.01161386, "auxiliary_loss_mlp": 0.01060571, "balance_loss_clip": 1.05785358, "balance_loss_mlp": 1.04010296, "epoch": 0.1337141139335638, "flos": 12531603911040.0, "grad_norm": 4.277142483609355, "language_loss": 0.82505226, "learning_rate": 3.88822101573484e-06, "loss": 0.84727186, "num_input_tokens_seen": 48129840, "step": 2224, "time_per_iteration": 2.608372926712036 }, { "auxiliary_loss_clip": 0.01173412, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.0573926, "balance_loss_mlp": 1.0290221, "epoch": 0.13377423718623177, "flos": 23038957889280.0, "grad_norm": 1.9890294619132924, "language_loss": 0.66270435, "learning_rate": 3.888092602028167e-06, "loss": 0.68493932, "num_input_tokens_seen": 48149240, "step": 2225, "time_per_iteration": 2.6304945945739746 }, { "auxiliary_loss_clip": 0.01153626, "auxiliary_loss_mlp": 0.01051637, "balance_loss_clip": 1.05233717, "balance_loss_mlp": 1.03180075, "epoch": 0.13383436043889974, "flos": 16216397637120.0, "grad_norm": 2.2915668787246997, "language_loss": 0.89469218, "learning_rate": 3.887964116724835e-06, "loss": 0.91674477, "num_input_tokens_seen": 48166330, "step": 2226, "time_per_iteration": 2.6002328395843506 }, { "auxiliary_loss_clip": 0.01150395, "auxiliary_loss_mlp": 0.01054296, "balance_loss_clip": 1.0549798, "balance_loss_mlp": 1.03423262, "epoch": 0.1338944836915677, "flos": 24279671520000.0, "grad_norm": 1.7271512115821777, "language_loss": 0.73209751, "learning_rate": 3.887835559829712e-06, "loss": 0.75414443, "num_input_tokens_seen": 48187600, "step": 2227, "time_per_iteration": 2.706193447113037 }, { "auxiliary_loss_clip": 0.01157707, "auxiliary_loss_mlp": 0.01047387, "balance_loss_clip": 1.05518484, "balance_loss_mlp": 1.02683568, "epoch": 0.1339546069442357, "flos": 17598742594560.0, "grad_norm": 2.848999829625599, "language_loss": 0.85160232, "learning_rate": 3.8877069313476764e-06, "loss": 0.87365323, "num_input_tokens_seen": 48204400, "step": 2228, "time_per_iteration": 2.689209222793579 }, { "auxiliary_loss_clip": 0.01132803, "auxiliary_loss_mlp": 0.01052829, "balance_loss_clip": 1.04935181, "balance_loss_mlp": 1.03126431, "epoch": 0.13401473019690366, "flos": 18990065952000.0, "grad_norm": 1.909679794697233, "language_loss": 0.81460214, "learning_rate": 3.8875782312836054e-06, "loss": 0.83645844, "num_input_tokens_seen": 48222180, "step": 2229, "time_per_iteration": 2.6380228996276855 }, { "auxiliary_loss_clip": 0.0110557, "auxiliary_loss_mlp": 0.01052684, "balance_loss_clip": 1.04774594, "balance_loss_mlp": 1.03233457, "epoch": 0.13407485344957162, "flos": 26943812288640.0, "grad_norm": 1.7464076089691416, "language_loss": 0.73822236, "learning_rate": 3.887449459642378e-06, "loss": 0.7598049, "num_input_tokens_seen": 48243245, "step": 2230, "time_per_iteration": 2.7332983016967773 }, { "auxiliary_loss_clip": 0.01125236, "auxiliary_loss_mlp": 0.01058977, "balance_loss_clip": 1.05213606, "balance_loss_mlp": 1.03890252, "epoch": 0.1341349767022396, "flos": 20339373375360.0, "grad_norm": 1.6827882777998602, "language_loss": 0.80133682, "learning_rate": 3.8873206164288785e-06, "loss": 0.82317901, "num_input_tokens_seen": 48262600, "step": 2231, "time_per_iteration": 2.6759045124053955 }, { "auxiliary_loss_clip": 0.01111387, "auxiliary_loss_mlp": 0.01057582, "balance_loss_clip": 1.04997492, "balance_loss_mlp": 1.03499198, "epoch": 0.13419509995490755, "flos": 29862020931840.0, "grad_norm": 1.746756846769887, "language_loss": 0.72152746, "learning_rate": 3.887191701647992e-06, "loss": 0.74321723, "num_input_tokens_seen": 48285075, "step": 2232, "time_per_iteration": 4.391890048980713 }, { "auxiliary_loss_clip": 0.0112104, "auxiliary_loss_mlp": 0.01051805, "balance_loss_clip": 1.0481019, "balance_loss_mlp": 1.03039551, "epoch": 0.13425522320757552, "flos": 26942986275840.0, "grad_norm": 2.4719586176391686, "language_loss": 0.65116024, "learning_rate": 3.8870627153046066e-06, "loss": 0.67288864, "num_input_tokens_seen": 48301285, "step": 2233, "time_per_iteration": 4.234508037567139 }, { "auxiliary_loss_clip": 0.01167005, "auxiliary_loss_mlp": 0.0104461, "balance_loss_clip": 1.05189967, "balance_loss_mlp": 1.02421367, "epoch": 0.1343153464602435, "flos": 15777281871360.0, "grad_norm": 2.4864430088666656, "language_loss": 0.80878961, "learning_rate": 3.886933657403615e-06, "loss": 0.8309058, "num_input_tokens_seen": 48317835, "step": 2234, "time_per_iteration": 4.175215005874634 }, { "auxiliary_loss_clip": 0.01140761, "auxiliary_loss_mlp": 0.01054039, "balance_loss_clip": 1.05052733, "balance_loss_mlp": 1.03268874, "epoch": 0.13437546971291148, "flos": 24314756129280.0, "grad_norm": 2.0569321713284827, "language_loss": 0.82114553, "learning_rate": 3.886804527949909e-06, "loss": 0.84309351, "num_input_tokens_seen": 48335670, "step": 2235, "time_per_iteration": 2.6588025093078613 }, { "auxiliary_loss_clip": 0.01149093, "auxiliary_loss_mlp": 0.01052015, "balance_loss_clip": 1.05040097, "balance_loss_mlp": 1.02983022, "epoch": 0.13443559296557944, "flos": 26650673395200.0, "grad_norm": 1.6363146905087136, "language_loss": 0.86092007, "learning_rate": 3.8866753269483864e-06, "loss": 0.88293117, "num_input_tokens_seen": 48357805, "step": 2236, "time_per_iteration": 4.349383592605591 }, { "auxiliary_loss_clip": 0.01166751, "auxiliary_loss_mlp": 0.01047925, "balance_loss_clip": 1.05288053, "balance_loss_mlp": 1.02724242, "epoch": 0.1344957162182474, "flos": 21796197183360.0, "grad_norm": 1.82135056053112, "language_loss": 0.77258497, "learning_rate": 3.886546054403946e-06, "loss": 0.79473174, "num_input_tokens_seen": 48377845, "step": 2237, "time_per_iteration": 2.6398766040802 }, { "auxiliary_loss_clip": 0.01145425, "auxiliary_loss_mlp": 0.01051006, "balance_loss_clip": 1.05016851, "balance_loss_mlp": 1.02919102, "epoch": 0.13455583947091537, "flos": 19865568049920.0, "grad_norm": 2.440947698046141, "language_loss": 0.78772336, "learning_rate": 3.886416710321491e-06, "loss": 0.80968761, "num_input_tokens_seen": 48394735, "step": 2238, "time_per_iteration": 2.6556923389434814 }, { "auxiliary_loss_clip": 0.01141594, "auxiliary_loss_mlp": 0.01050085, "balance_loss_clip": 1.05123293, "balance_loss_mlp": 1.02878201, "epoch": 0.13461596272358334, "flos": 30846835094400.0, "grad_norm": 2.9136729194949735, "language_loss": 0.68486369, "learning_rate": 3.886287294705924e-06, "loss": 0.70678043, "num_input_tokens_seen": 48414200, "step": 2239, "time_per_iteration": 2.6778814792633057 }, { "auxiliary_loss_clip": 0.01147129, "auxiliary_loss_mlp": 0.01052633, "balance_loss_clip": 1.0515976, "balance_loss_mlp": 1.03197384, "epoch": 0.1346760859762513, "flos": 12494436312960.0, "grad_norm": 2.3763106012672925, "language_loss": 0.81277847, "learning_rate": 3.8861578075621555e-06, "loss": 0.8347761, "num_input_tokens_seen": 48431065, "step": 2240, "time_per_iteration": 2.5920939445495605 }, { "auxiliary_loss_clip": 0.01107793, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.04459488, "balance_loss_mlp": 1.02884459, "epoch": 0.1347362092289193, "flos": 21836022387840.0, "grad_norm": 1.7269080191231387, "language_loss": 0.77183759, "learning_rate": 3.886028248895093e-06, "loss": 0.79341465, "num_input_tokens_seen": 48450335, "step": 2241, "time_per_iteration": 2.7224419116973877 }, { "auxiliary_loss_clip": 0.0116331, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.05439126, "balance_loss_mlp": 1.02324009, "epoch": 0.13479633248158726, "flos": 23509459163520.0, "grad_norm": 2.0305903786470743, "language_loss": 0.83062387, "learning_rate": 3.88589861870965e-06, "loss": 0.85267115, "num_input_tokens_seen": 48468555, "step": 2242, "time_per_iteration": 2.5794169902801514 }, { "auxiliary_loss_clip": 0.01170048, "auxiliary_loss_mlp": 0.01056609, "balance_loss_clip": 1.05504107, "balance_loss_mlp": 1.03469825, "epoch": 0.13485645573425523, "flos": 29344332165120.0, "grad_norm": 2.465549548535016, "language_loss": 0.6498239, "learning_rate": 3.885768917010744e-06, "loss": 0.67209053, "num_input_tokens_seen": 48488515, "step": 2243, "time_per_iteration": 2.6709110736846924 }, { "auxiliary_loss_clip": 0.01125086, "auxiliary_loss_mlp": 0.01046786, "balance_loss_clip": 1.04593956, "balance_loss_mlp": 1.02618706, "epoch": 0.1349165789869232, "flos": 28037112503040.0, "grad_norm": 1.7770524512670738, "language_loss": 0.72633034, "learning_rate": 3.8856391438032895e-06, "loss": 0.74804902, "num_input_tokens_seen": 48510515, "step": 2244, "time_per_iteration": 2.713803768157959 }, { "auxiliary_loss_clip": 0.0115377, "auxiliary_loss_mlp": 0.0105148, "balance_loss_clip": 1.05312431, "balance_loss_mlp": 1.03209639, "epoch": 0.13497670223959116, "flos": 22853730430080.0, "grad_norm": 1.7564166456764931, "language_loss": 0.86023217, "learning_rate": 3.88550929909221e-06, "loss": 0.88228464, "num_input_tokens_seen": 48529940, "step": 2245, "time_per_iteration": 2.626560926437378 }, { "auxiliary_loss_clip": 0.01149467, "auxiliary_loss_mlp": 0.0105327, "balance_loss_clip": 1.05035663, "balance_loss_mlp": 1.03346968, "epoch": 0.13503682549225912, "flos": 16504580453760.0, "grad_norm": 1.7861449859595755, "language_loss": 0.78912753, "learning_rate": 3.88537938288243e-06, "loss": 0.8111549, "num_input_tokens_seen": 48548190, "step": 2246, "time_per_iteration": 2.6543703079223633 }, { "auxiliary_loss_clip": 0.010304, "auxiliary_loss_mlp": 0.01015407, "balance_loss_clip": 1.03666449, "balance_loss_mlp": 1.01285601, "epoch": 0.1350969487449271, "flos": 70756303242240.0, "grad_norm": 0.7509256694227144, "language_loss": 0.6054731, "learning_rate": 3.885249395178874e-06, "loss": 0.62593114, "num_input_tokens_seen": 48613165, "step": 2247, "time_per_iteration": 3.3349809646606445 }, { "auxiliary_loss_clip": 0.01162017, "auxiliary_loss_mlp": 0.01056869, "balance_loss_clip": 1.05492628, "balance_loss_mlp": 1.03470767, "epoch": 0.13515707199759508, "flos": 23075981832960.0, "grad_norm": 2.562042993856578, "language_loss": 0.80841738, "learning_rate": 3.885119335986473e-06, "loss": 0.83060622, "num_input_tokens_seen": 48631705, "step": 2248, "time_per_iteration": 2.6279287338256836 }, { "auxiliary_loss_clip": 0.0114073, "auxiliary_loss_mlp": 0.01049128, "balance_loss_clip": 1.05086231, "balance_loss_mlp": 1.03054309, "epoch": 0.13521719525026304, "flos": 23186371305600.0, "grad_norm": 1.9247838227480492, "language_loss": 0.77108699, "learning_rate": 3.884989205310157e-06, "loss": 0.79298556, "num_input_tokens_seen": 48649740, "step": 2249, "time_per_iteration": 2.7100210189819336 }, { "auxiliary_loss_clip": 0.0112733, "auxiliary_loss_mlp": 0.01057649, "balance_loss_clip": 1.05325472, "balance_loss_mlp": 1.03863478, "epoch": 0.135277318502931, "flos": 24790931752320.0, "grad_norm": 1.7403695434994237, "language_loss": 0.84457541, "learning_rate": 3.884859003154862e-06, "loss": 0.86642522, "num_input_tokens_seen": 48671565, "step": 2250, "time_per_iteration": 2.789350986480713 }, { "auxiliary_loss_clip": 0.01155547, "auxiliary_loss_mlp": 0.0105348, "balance_loss_clip": 1.05310512, "balance_loss_mlp": 1.03243995, "epoch": 0.13533744175559898, "flos": 21908525990400.0, "grad_norm": 3.018154510939524, "language_loss": 0.81796515, "learning_rate": 3.884728729525524e-06, "loss": 0.84005541, "num_input_tokens_seen": 48690425, "step": 2251, "time_per_iteration": 2.685617208480835 }, { "auxiliary_loss_clip": 0.01165433, "auxiliary_loss_mlp": 0.01060257, "balance_loss_clip": 1.05235004, "balance_loss_mlp": 1.03888273, "epoch": 0.13539756500826694, "flos": 21211643249280.0, "grad_norm": 1.7680273527580506, "language_loss": 0.86173487, "learning_rate": 3.884598384427084e-06, "loss": 0.88399172, "num_input_tokens_seen": 48707505, "step": 2252, "time_per_iteration": 2.597219467163086 }, { "auxiliary_loss_clip": 0.01052296, "auxiliary_loss_mlp": 0.01018557, "balance_loss_clip": 1.02446079, "balance_loss_mlp": 1.01632786, "epoch": 0.1354576882609349, "flos": 63242103634560.0, "grad_norm": 0.8028920055572067, "language_loss": 0.61837333, "learning_rate": 3.884467967864485e-06, "loss": 0.6390819, "num_input_tokens_seen": 48775895, "step": 2253, "time_per_iteration": 3.25115704536438 }, { "auxiliary_loss_clip": 0.01155107, "auxiliary_loss_mlp": 0.01055639, "balance_loss_clip": 1.0539906, "balance_loss_mlp": 1.03587449, "epoch": 0.1355178115136029, "flos": 25483037984640.0, "grad_norm": 1.6376691715964824, "language_loss": 0.89441288, "learning_rate": 3.884337479842671e-06, "loss": 0.91652036, "num_input_tokens_seen": 48798370, "step": 2254, "time_per_iteration": 2.6803932189941406 }, { "auxiliary_loss_clip": 0.01131786, "auxiliary_loss_mlp": 0.01063066, "balance_loss_clip": 1.04506016, "balance_loss_mlp": 1.03872383, "epoch": 0.13557793476627086, "flos": 21616967295360.0, "grad_norm": 2.1104776784573787, "language_loss": 0.84626925, "learning_rate": 3.884206920366591e-06, "loss": 0.86821771, "num_input_tokens_seen": 48817955, "step": 2255, "time_per_iteration": 2.7074074745178223 }, { "auxiliary_loss_clip": 0.01165481, "auxiliary_loss_mlp": 0.01058458, "balance_loss_clip": 1.05211091, "balance_loss_mlp": 1.03767991, "epoch": 0.13563805801893883, "flos": 24928253447040.0, "grad_norm": 4.791676738707355, "language_loss": 0.74684238, "learning_rate": 3.884076289441196e-06, "loss": 0.76908177, "num_input_tokens_seen": 48836330, "step": 2256, "time_per_iteration": 2.590178966522217 }, { "auxiliary_loss_clip": 0.01127027, "auxiliary_loss_mlp": 0.01054317, "balance_loss_clip": 1.04977024, "balance_loss_mlp": 1.03338361, "epoch": 0.1356981812716068, "flos": 14750272206720.0, "grad_norm": 5.890843360804152, "language_loss": 0.8309083, "learning_rate": 3.88394558707144e-06, "loss": 0.85272169, "num_input_tokens_seen": 48851890, "step": 2257, "time_per_iteration": 2.642096519470215 }, { "auxiliary_loss_clip": 0.0114984, "auxiliary_loss_mlp": 0.00780177, "balance_loss_clip": 1.05128407, "balance_loss_mlp": 1.00013828, "epoch": 0.13575830452427476, "flos": 11108571822720.0, "grad_norm": 2.1957250492246505, "language_loss": 0.82045269, "learning_rate": 3.883814813262277e-06, "loss": 0.83975297, "num_input_tokens_seen": 48865510, "step": 2258, "time_per_iteration": 2.6279473304748535 }, { "auxiliary_loss_clip": 0.01155515, "auxiliary_loss_mlp": 0.01054519, "balance_loss_clip": 1.05172098, "balance_loss_mlp": 1.03152323, "epoch": 0.13581842777694272, "flos": 17960290940160.0, "grad_norm": 2.6364031487830464, "language_loss": 0.82694167, "learning_rate": 3.883683968018669e-06, "loss": 0.849042, "num_input_tokens_seen": 48882360, "step": 2259, "time_per_iteration": 2.677804708480835 }, { "auxiliary_loss_clip": 0.01127201, "auxiliary_loss_mlp": 0.01054646, "balance_loss_clip": 1.0495683, "balance_loss_mlp": 1.03547728, "epoch": 0.1358785510296107, "flos": 22857142222080.0, "grad_norm": 2.0790748617118853, "language_loss": 0.73916006, "learning_rate": 3.8835530513455755e-06, "loss": 0.76097858, "num_input_tokens_seen": 48902700, "step": 2260, "time_per_iteration": 2.7416799068450928 }, { "auxiliary_loss_clip": 0.01144177, "auxiliary_loss_mlp": 0.01056881, "balance_loss_clip": 1.05196047, "balance_loss_mlp": 1.03691387, "epoch": 0.13593867428227868, "flos": 25739404329600.0, "grad_norm": 3.546593987683097, "language_loss": 0.74799728, "learning_rate": 3.883422063247961e-06, "loss": 0.77000785, "num_input_tokens_seen": 48922525, "step": 2261, "time_per_iteration": 2.675342559814453 }, { "auxiliary_loss_clip": 0.01170469, "auxiliary_loss_mlp": 0.01050986, "balance_loss_clip": 1.05486035, "balance_loss_mlp": 1.03043413, "epoch": 0.13599879753494665, "flos": 31249214225280.0, "grad_norm": 2.967396076139427, "language_loss": 0.63602281, "learning_rate": 3.883291003730794e-06, "loss": 0.65823734, "num_input_tokens_seen": 48942510, "step": 2262, "time_per_iteration": 2.660538911819458 }, { "auxiliary_loss_clip": 0.01148004, "auxiliary_loss_mlp": 0.01052118, "balance_loss_clip": 1.0516696, "balance_loss_mlp": 1.03216195, "epoch": 0.1360589207876146, "flos": 23915034604800.0, "grad_norm": 2.301949377353301, "language_loss": 0.81810403, "learning_rate": 3.883159872799043e-06, "loss": 0.84010524, "num_input_tokens_seen": 48962625, "step": 2263, "time_per_iteration": 2.840043783187866 }, { "auxiliary_loss_clip": 0.01098888, "auxiliary_loss_mlp": 0.01064302, "balance_loss_clip": 1.04875195, "balance_loss_mlp": 1.0410558, "epoch": 0.13611904404028258, "flos": 19974197756160.0, "grad_norm": 1.7561035968690553, "language_loss": 0.87737143, "learning_rate": 3.8830286704576815e-06, "loss": 0.89900339, "num_input_tokens_seen": 48982525, "step": 2264, "time_per_iteration": 2.784648895263672 }, { "auxiliary_loss_clip": 0.01157618, "auxiliary_loss_mlp": 0.01049521, "balance_loss_clip": 1.05161715, "balance_loss_mlp": 1.02709746, "epoch": 0.13617916729295054, "flos": 15340644144000.0, "grad_norm": 3.151792845640157, "language_loss": 0.7115528, "learning_rate": 3.882897396711683e-06, "loss": 0.7336241, "num_input_tokens_seen": 48997605, "step": 2265, "time_per_iteration": 2.6108245849609375 }, { "auxiliary_loss_clip": 0.01111831, "auxiliary_loss_mlp": 0.01042545, "balance_loss_clip": 1.05199265, "balance_loss_mlp": 1.02256525, "epoch": 0.1362392905456185, "flos": 27451445247360.0, "grad_norm": 4.918827494175735, "language_loss": 0.6671263, "learning_rate": 3.882766051566027e-06, "loss": 0.68867004, "num_input_tokens_seen": 49018535, "step": 2266, "time_per_iteration": 2.7810373306274414 }, { "auxiliary_loss_clip": 0.01127539, "auxiliary_loss_mlp": 0.01057589, "balance_loss_clip": 1.05683684, "balance_loss_mlp": 1.03739524, "epoch": 0.1362994137982865, "flos": 25009017177600.0, "grad_norm": 1.707924588861666, "language_loss": 0.7634865, "learning_rate": 3.882634635025694e-06, "loss": 0.78533769, "num_input_tokens_seen": 49038865, "step": 2267, "time_per_iteration": 2.7682721614837646 }, { "auxiliary_loss_clip": 0.01133448, "auxiliary_loss_mlp": 0.01048207, "balance_loss_clip": 1.04668903, "balance_loss_mlp": 1.02641535, "epoch": 0.13635953705095447, "flos": 20303031790080.0, "grad_norm": 2.9531688260339934, "language_loss": 0.81653506, "learning_rate": 3.882503147095667e-06, "loss": 0.83835161, "num_input_tokens_seen": 49058010, "step": 2268, "time_per_iteration": 2.645081043243408 }, { "auxiliary_loss_clip": 0.01155147, "auxiliary_loss_mlp": 0.01048448, "balance_loss_clip": 1.05424881, "balance_loss_mlp": 1.02738333, "epoch": 0.13641966030362243, "flos": 31358418549120.0, "grad_norm": 1.9923150848418427, "language_loss": 0.75975174, "learning_rate": 3.882371587780931e-06, "loss": 0.78178769, "num_input_tokens_seen": 49080330, "step": 2269, "time_per_iteration": 2.6764814853668213 }, { "auxiliary_loss_clip": 0.0113465, "auxiliary_loss_mlp": 0.01049702, "balance_loss_clip": 1.04941857, "balance_loss_mlp": 1.02844727, "epoch": 0.1364797835562904, "flos": 20478095700480.0, "grad_norm": 2.1475090354855473, "language_loss": 0.81328762, "learning_rate": 3.882239957086477e-06, "loss": 0.83513117, "num_input_tokens_seen": 49097035, "step": 2270, "time_per_iteration": 2.6801655292510986 }, { "auxiliary_loss_clip": 0.01142111, "auxiliary_loss_mlp": 0.010594, "balance_loss_clip": 1.04989171, "balance_loss_mlp": 1.03773928, "epoch": 0.13653990680895836, "flos": 13078343802240.0, "grad_norm": 3.2227070482893976, "language_loss": 0.75812757, "learning_rate": 3.882108255017295e-06, "loss": 0.78014266, "num_input_tokens_seen": 49113945, "step": 2271, "time_per_iteration": 4.197805166244507 }, { "auxiliary_loss_clip": 0.01156913, "auxiliary_loss_mlp": 0.01061846, "balance_loss_clip": 1.05097795, "balance_loss_mlp": 1.03921962, "epoch": 0.13660003006162633, "flos": 16946712961920.0, "grad_norm": 2.2800716885469754, "language_loss": 0.80251753, "learning_rate": 3.881976481578379e-06, "loss": 0.82470512, "num_input_tokens_seen": 49132855, "step": 2272, "time_per_iteration": 4.1461029052734375 }, { "auxiliary_loss_clip": 0.01055091, "auxiliary_loss_mlp": 0.01042701, "balance_loss_clip": 1.02539539, "balance_loss_mlp": 1.04001904, "epoch": 0.1366601533142943, "flos": 68682749892480.0, "grad_norm": 0.7097054685047118, "language_loss": 0.60739923, "learning_rate": 3.8818446367747255e-06, "loss": 0.62837708, "num_input_tokens_seen": 49198310, "step": 2273, "time_per_iteration": 4.731219530105591 }, { "auxiliary_loss_clip": 0.01165514, "auxiliary_loss_mlp": 0.00780474, "balance_loss_clip": 1.0523783, "balance_loss_mlp": 1.00008452, "epoch": 0.13672027656696228, "flos": 19244241567360.0, "grad_norm": 2.4844725334882583, "language_loss": 0.77506429, "learning_rate": 3.881712720611336e-06, "loss": 0.79452413, "num_input_tokens_seen": 49217250, "step": 2274, "time_per_iteration": 2.7122738361358643 }, { "auxiliary_loss_clip": 0.01154937, "auxiliary_loss_mlp": 0.01054542, "balance_loss_clip": 1.05082417, "balance_loss_mlp": 1.03271496, "epoch": 0.13678039981963025, "flos": 24534924543360.0, "grad_norm": 2.391437383339344, "language_loss": 0.78256011, "learning_rate": 3.881580733093211e-06, "loss": 0.8046549, "num_input_tokens_seen": 49236615, "step": 2275, "time_per_iteration": 2.6674444675445557 }, { "auxiliary_loss_clip": 0.01154585, "auxiliary_loss_mlp": 0.01044634, "balance_loss_clip": 1.05220842, "balance_loss_mlp": 1.02449977, "epoch": 0.13684052307229821, "flos": 15669334523520.0, "grad_norm": 2.271072834476717, "language_loss": 0.81682789, "learning_rate": 3.881448674225356e-06, "loss": 0.83882004, "num_input_tokens_seen": 49253935, "step": 2276, "time_per_iteration": 4.202202558517456 }, { "auxiliary_loss_clip": 0.01164941, "auxiliary_loss_mlp": 0.01060078, "balance_loss_clip": 1.05228245, "balance_loss_mlp": 1.03604531, "epoch": 0.13690064632496618, "flos": 28364689560960.0, "grad_norm": 5.063053962589045, "language_loss": 0.69948691, "learning_rate": 3.881316544012779e-06, "loss": 0.72173715, "num_input_tokens_seen": 49273605, "step": 2277, "time_per_iteration": 2.708591938018799 }, { "auxiliary_loss_clip": 0.01160044, "auxiliary_loss_mlp": 0.00780297, "balance_loss_clip": 1.05169702, "balance_loss_mlp": 1.00017083, "epoch": 0.13696076957763414, "flos": 23404779953280.0, "grad_norm": 2.062701620585305, "language_loss": 0.80197465, "learning_rate": 3.88118434246049e-06, "loss": 0.82137805, "num_input_tokens_seen": 49291785, "step": 2278, "time_per_iteration": 2.6916158199310303 }, { "auxiliary_loss_clip": 0.01159146, "auxiliary_loss_mlp": 0.01060686, "balance_loss_clip": 1.05954766, "balance_loss_mlp": 1.03925228, "epoch": 0.1370208928303021, "flos": 37196595601920.0, "grad_norm": 7.088344486179519, "language_loss": 0.75048816, "learning_rate": 3.881052069573502e-06, "loss": 0.77268648, "num_input_tokens_seen": 49311405, "step": 2279, "time_per_iteration": 2.7316977977752686 }, { "auxiliary_loss_clip": 0.01101952, "auxiliary_loss_mlp": 0.01066685, "balance_loss_clip": 1.04605758, "balance_loss_mlp": 1.04485774, "epoch": 0.13708101608297008, "flos": 26976311118720.0, "grad_norm": 2.5293116992138223, "language_loss": 0.76743513, "learning_rate": 3.880919725356831e-06, "loss": 0.78912151, "num_input_tokens_seen": 49331835, "step": 2280, "time_per_iteration": 2.813720941543579 }, { "auxiliary_loss_clip": 0.01108594, "auxiliary_loss_mlp": 0.01060805, "balance_loss_clip": 1.04457331, "balance_loss_mlp": 1.04022956, "epoch": 0.13714113933563807, "flos": 32556864850560.0, "grad_norm": 2.0597640944890325, "language_loss": 0.79657966, "learning_rate": 3.880787309815496e-06, "loss": 0.81827366, "num_input_tokens_seen": 49352290, "step": 2281, "time_per_iteration": 2.8325345516204834 }, { "auxiliary_loss_clip": 0.0117656, "auxiliary_loss_mlp": 0.0107773, "balance_loss_clip": 1.05715084, "balance_loss_mlp": 1.05671358, "epoch": 0.13720126258830603, "flos": 16101267569280.0, "grad_norm": 2.0769142230572877, "language_loss": 0.83383757, "learning_rate": 3.880654822954518e-06, "loss": 0.85638046, "num_input_tokens_seen": 49370285, "step": 2282, "time_per_iteration": 2.5988755226135254 }, { "auxiliary_loss_clip": 0.01142098, "auxiliary_loss_mlp": 0.01075909, "balance_loss_clip": 1.04898703, "balance_loss_mlp": 1.05583453, "epoch": 0.137261385840974, "flos": 18953544798720.0, "grad_norm": 1.5269487193470777, "language_loss": 0.73526621, "learning_rate": 3.8805222647789195e-06, "loss": 0.75744629, "num_input_tokens_seen": 49389610, "step": 2283, "time_per_iteration": 2.7099714279174805 }, { "auxiliary_loss_clip": 0.01160178, "auxiliary_loss_mlp": 0.01062577, "balance_loss_clip": 1.05577087, "balance_loss_mlp": 1.04173923, "epoch": 0.13732150909364196, "flos": 23295360147840.0, "grad_norm": 2.2306012559941455, "language_loss": 0.83934438, "learning_rate": 3.880389635293729e-06, "loss": 0.86157191, "num_input_tokens_seen": 49408390, "step": 2284, "time_per_iteration": 2.7315831184387207 }, { "auxiliary_loss_clip": 0.01151427, "auxiliary_loss_mlp": 0.01070288, "balance_loss_clip": 1.05204272, "balance_loss_mlp": 1.04779351, "epoch": 0.13738163234630993, "flos": 29351263489920.0, "grad_norm": 2.0900141273659223, "language_loss": 0.7557056, "learning_rate": 3.880256934503974e-06, "loss": 0.77792281, "num_input_tokens_seen": 49427725, "step": 2285, "time_per_iteration": 2.7257747650146484 }, { "auxiliary_loss_clip": 0.01144078, "auxiliary_loss_mlp": 0.01064539, "balance_loss_clip": 1.05233073, "balance_loss_mlp": 1.04392731, "epoch": 0.1374417555989779, "flos": 26651319840000.0, "grad_norm": 2.727019945657865, "language_loss": 0.74521589, "learning_rate": 3.880124162414689e-06, "loss": 0.76730204, "num_input_tokens_seen": 49449000, "step": 2286, "time_per_iteration": 2.742582082748413 }, { "auxiliary_loss_clip": 0.0112541, "auxiliary_loss_mlp": 0.01059198, "balance_loss_clip": 1.04906356, "balance_loss_mlp": 1.03659606, "epoch": 0.1375018788516459, "flos": 28403401443840.0, "grad_norm": 2.2168449035378357, "language_loss": 0.86683542, "learning_rate": 3.879991319030908e-06, "loss": 0.88868147, "num_input_tokens_seen": 49468360, "step": 2287, "time_per_iteration": 2.802088499069214 }, { "auxiliary_loss_clip": 0.01124712, "auxiliary_loss_mlp": 0.01064517, "balance_loss_clip": 1.04803944, "balance_loss_mlp": 1.04207003, "epoch": 0.13756200210431385, "flos": 37413783187200.0, "grad_norm": 2.0592152854463106, "language_loss": 0.68410838, "learning_rate": 3.879858404357666e-06, "loss": 0.70600063, "num_input_tokens_seen": 49493450, "step": 2288, "time_per_iteration": 2.861175537109375 }, { "auxiliary_loss_clip": 0.01112106, "auxiliary_loss_mlp": 0.01071262, "balance_loss_clip": 1.05062151, "balance_loss_mlp": 1.04666936, "epoch": 0.13762212535698182, "flos": 22711021695360.0, "grad_norm": 2.3933568244149357, "language_loss": 0.87090456, "learning_rate": 3.879725418400005e-06, "loss": 0.89273822, "num_input_tokens_seen": 49511220, "step": 2289, "time_per_iteration": 2.7185773849487305 }, { "auxiliary_loss_clip": 0.01130193, "auxiliary_loss_mlp": 0.00781167, "balance_loss_clip": 1.0480957, "balance_loss_mlp": 1.00019848, "epoch": 0.13768224860964978, "flos": 23952130375680.0, "grad_norm": 1.8106848287624444, "language_loss": 0.74668044, "learning_rate": 3.879592361162969e-06, "loss": 0.76579404, "num_input_tokens_seen": 49529820, "step": 2290, "time_per_iteration": 2.6751222610473633 }, { "auxiliary_loss_clip": 0.01039657, "auxiliary_loss_mlp": 0.01081332, "balance_loss_clip": 1.03094769, "balance_loss_mlp": 1.07881641, "epoch": 0.13774237186231775, "flos": 63590438753280.0, "grad_norm": 0.7179159366671727, "language_loss": 0.51597112, "learning_rate": 3.8794592326516015e-06, "loss": 0.53718102, "num_input_tokens_seen": 49595325, "step": 2291, "time_per_iteration": 3.2823359966278076 }, { "auxiliary_loss_clip": 0.01157406, "auxiliary_loss_mlp": 0.01052846, "balance_loss_clip": 1.05224037, "balance_loss_mlp": 1.03123331, "epoch": 0.1378024951149857, "flos": 24279456038400.0, "grad_norm": 1.9326408617769533, "language_loss": 0.71273667, "learning_rate": 3.879326032870952e-06, "loss": 0.7348392, "num_input_tokens_seen": 49615850, "step": 2292, "time_per_iteration": 2.74045729637146 }, { "auxiliary_loss_clip": 0.01156871, "auxiliary_loss_mlp": 0.01049315, "balance_loss_clip": 1.05427122, "balance_loss_mlp": 1.02931166, "epoch": 0.13786261836765368, "flos": 14021537080320.0, "grad_norm": 6.592759889378346, "language_loss": 0.8047784, "learning_rate": 3.879192761826071e-06, "loss": 0.82684022, "num_input_tokens_seen": 49631860, "step": 2293, "time_per_iteration": 2.587576389312744 }, { "auxiliary_loss_clip": 0.0115787, "auxiliary_loss_mlp": 0.0104972, "balance_loss_clip": 1.0554558, "balance_loss_mlp": 1.02921653, "epoch": 0.13792274162032167, "flos": 28878679226880.0, "grad_norm": 1.9082895606463517, "language_loss": 0.78440171, "learning_rate": 3.879059419522011e-06, "loss": 0.80647767, "num_input_tokens_seen": 49652145, "step": 2294, "time_per_iteration": 2.7152793407440186 }, { "auxiliary_loss_clip": 0.01126374, "auxiliary_loss_mlp": 0.01050648, "balance_loss_clip": 1.05281758, "balance_loss_mlp": 1.03104973, "epoch": 0.13798286487298964, "flos": 21141150808320.0, "grad_norm": 1.991103290125302, "language_loss": 0.80339509, "learning_rate": 3.878926005963831e-06, "loss": 0.82516527, "num_input_tokens_seen": 49669880, "step": 2295, "time_per_iteration": 2.7026021480560303 }, { "auxiliary_loss_clip": 0.01154693, "auxiliary_loss_mlp": 0.01052186, "balance_loss_clip": 1.05239046, "balance_loss_mlp": 1.03102624, "epoch": 0.1380429881256576, "flos": 22487477402880.0, "grad_norm": 1.7450624966187134, "language_loss": 0.78661883, "learning_rate": 3.878792521156588e-06, "loss": 0.80868757, "num_input_tokens_seen": 49687255, "step": 2296, "time_per_iteration": 2.566929340362549 }, { "auxiliary_loss_clip": 0.01153425, "auxiliary_loss_mlp": 0.01069343, "balance_loss_clip": 1.05437231, "balance_loss_mlp": 1.04811132, "epoch": 0.13810311137832557, "flos": 21393674398080.0, "grad_norm": 1.7434096141785573, "language_loss": 0.78663194, "learning_rate": 3.8786589651053446e-06, "loss": 0.80885959, "num_input_tokens_seen": 49706650, "step": 2297, "time_per_iteration": 2.6254489421844482 }, { "auxiliary_loss_clip": 0.01110905, "auxiliary_loss_mlp": 0.01059754, "balance_loss_clip": 1.05296302, "balance_loss_mlp": 1.03871369, "epoch": 0.13816323463099353, "flos": 25989844930560.0, "grad_norm": 1.929043788877404, "language_loss": 0.69199705, "learning_rate": 3.878525337815164e-06, "loss": 0.71370363, "num_input_tokens_seen": 49725715, "step": 2298, "time_per_iteration": 2.791301965713501 }, { "auxiliary_loss_clip": 0.01137772, "auxiliary_loss_mlp": 0.01061768, "balance_loss_clip": 1.0517292, "balance_loss_mlp": 1.04059684, "epoch": 0.1382233578836615, "flos": 19244313394560.0, "grad_norm": 1.7910922430646712, "language_loss": 0.86382294, "learning_rate": 3.878391639291116e-06, "loss": 0.88581836, "num_input_tokens_seen": 49744710, "step": 2299, "time_per_iteration": 2.6075453758239746 }, { "auxiliary_loss_clip": 0.01166817, "auxiliary_loss_mlp": 0.01054863, "balance_loss_clip": 1.05378175, "balance_loss_mlp": 1.03292871, "epoch": 0.1382834811363295, "flos": 25666290195840.0, "grad_norm": 2.2378660690879606, "language_loss": 0.75468475, "learning_rate": 3.878257869538267e-06, "loss": 0.77690154, "num_input_tokens_seen": 49764300, "step": 2300, "time_per_iteration": 2.663328170776367 }, { "auxiliary_loss_clip": 0.01130608, "auxiliary_loss_mlp": 0.01047248, "balance_loss_clip": 1.05274105, "balance_loss_mlp": 1.02664876, "epoch": 0.13834360438899745, "flos": 19784193788160.0, "grad_norm": 2.5571861214345963, "language_loss": 0.82463622, "learning_rate": 3.878124028561692e-06, "loss": 0.8464148, "num_input_tokens_seen": 49778380, "step": 2301, "time_per_iteration": 2.6705129146575928 }, { "auxiliary_loss_clip": 0.0113862, "auxiliary_loss_mlp": 0.00777879, "balance_loss_clip": 1.05323792, "balance_loss_mlp": 1.00021625, "epoch": 0.13840372764166542, "flos": 26651858544000.0, "grad_norm": 1.9612043619218924, "language_loss": 0.85957694, "learning_rate": 3.877990116366466e-06, "loss": 0.87874192, "num_input_tokens_seen": 49797460, "step": 2302, "time_per_iteration": 2.679797410964966 }, { "auxiliary_loss_clip": 0.01059341, "auxiliary_loss_mlp": 0.01025212, "balance_loss_clip": 1.03226125, "balance_loss_mlp": 1.02244604, "epoch": 0.13846385089433338, "flos": 70510998286080.0, "grad_norm": 0.7598813547967705, "language_loss": 0.65591633, "learning_rate": 3.877856132957667e-06, "loss": 0.67676187, "num_input_tokens_seen": 49868005, "step": 2303, "time_per_iteration": 3.3249399662017822 }, { "auxiliary_loss_clip": 0.01151443, "auxiliary_loss_mlp": 0.01046478, "balance_loss_clip": 1.05337632, "balance_loss_mlp": 1.02655792, "epoch": 0.13852397414700135, "flos": 17348732956800.0, "grad_norm": 3.141207945865242, "language_loss": 0.78663635, "learning_rate": 3.877722078340374e-06, "loss": 0.80861557, "num_input_tokens_seen": 49885825, "step": 2304, "time_per_iteration": 2.7364001274108887 }, { "auxiliary_loss_clip": 0.01157514, "auxiliary_loss_mlp": 0.01043253, "balance_loss_clip": 1.05566275, "balance_loss_mlp": 1.02385736, "epoch": 0.13858409739966931, "flos": 21543781334400.0, "grad_norm": 1.7487365854034607, "language_loss": 0.77559888, "learning_rate": 3.877587952519672e-06, "loss": 0.79760659, "num_input_tokens_seen": 49905975, "step": 2305, "time_per_iteration": 2.7814202308654785 }, { "auxiliary_loss_clip": 0.01074766, "auxiliary_loss_mlp": 0.01055718, "balance_loss_clip": 1.04160607, "balance_loss_mlp": 1.03473723, "epoch": 0.13864422065233728, "flos": 21579907438080.0, "grad_norm": 1.8207477060355044, "language_loss": 0.87737936, "learning_rate": 3.877453755500647e-06, "loss": 0.89868426, "num_input_tokens_seen": 49925800, "step": 2306, "time_per_iteration": 2.917616605758667 }, { "auxiliary_loss_clip": 0.01064826, "auxiliary_loss_mlp": 0.0101208, "balance_loss_clip": 1.02692199, "balance_loss_mlp": 1.0094099, "epoch": 0.13870434390500527, "flos": 53371156872960.0, "grad_norm": 0.8728538231155298, "language_loss": 0.59008431, "learning_rate": 3.877319487288387e-06, "loss": 0.61085337, "num_input_tokens_seen": 49977620, "step": 2307, "time_per_iteration": 3.4345149993896484 }, { "auxiliary_loss_clip": 0.01169624, "auxiliary_loss_mlp": 0.00778134, "balance_loss_clip": 1.05528641, "balance_loss_mlp": 1.00021303, "epoch": 0.13876446715767324, "flos": 22565906749440.0, "grad_norm": 1.8467673932802395, "language_loss": 0.79483795, "learning_rate": 3.877185147887984e-06, "loss": 0.81431556, "num_input_tokens_seen": 49996650, "step": 2308, "time_per_iteration": 2.7137296199798584 }, { "auxiliary_loss_clip": 0.01131024, "auxiliary_loss_mlp": 0.01050332, "balance_loss_clip": 1.05118585, "balance_loss_mlp": 1.03054297, "epoch": 0.1388245904103412, "flos": 20705231352960.0, "grad_norm": 2.352128383160346, "language_loss": 0.78101134, "learning_rate": 3.877050737304533e-06, "loss": 0.80282485, "num_input_tokens_seen": 50015640, "step": 2309, "time_per_iteration": 2.9259471893310547 }, { "auxiliary_loss_clip": 0.01128109, "auxiliary_loss_mlp": 0.01057348, "balance_loss_clip": 1.04979932, "balance_loss_mlp": 1.03620028, "epoch": 0.13888471366300917, "flos": 20554729367040.0, "grad_norm": 3.914796791761399, "language_loss": 0.68133545, "learning_rate": 3.876916255543129e-06, "loss": 0.70318997, "num_input_tokens_seen": 50033500, "step": 2310, "time_per_iteration": 4.27877140045166 }, { "auxiliary_loss_clip": 0.01164985, "auxiliary_loss_mlp": 0.01062516, "balance_loss_clip": 1.05356944, "balance_loss_mlp": 1.04021168, "epoch": 0.13894483691567713, "flos": 13838033473920.0, "grad_norm": 1.934954545600412, "language_loss": 0.84295756, "learning_rate": 3.8767817026088725e-06, "loss": 0.86523259, "num_input_tokens_seen": 50050075, "step": 2311, "time_per_iteration": 2.5612359046936035 }, { "auxiliary_loss_clip": 0.01173749, "auxiliary_loss_mlp": 0.01055474, "balance_loss_clip": 1.05752683, "balance_loss_mlp": 1.0350771, "epoch": 0.1390049601683451, "flos": 28031186759040.0, "grad_norm": 2.9213009430481143, "language_loss": 0.82358992, "learning_rate": 3.876647078506866e-06, "loss": 0.84588212, "num_input_tokens_seen": 50070080, "step": 2312, "time_per_iteration": 5.737139701843262 }, { "auxiliary_loss_clip": 0.01129781, "auxiliary_loss_mlp": 0.00778347, "balance_loss_clip": 1.05464363, "balance_loss_mlp": 1.00023031, "epoch": 0.13906508342101306, "flos": 26756860976640.0, "grad_norm": 2.109799495913242, "language_loss": 0.86732674, "learning_rate": 3.876512383242215e-06, "loss": 0.88640809, "num_input_tokens_seen": 50090040, "step": 2313, "time_per_iteration": 2.8402304649353027 }, { "auxiliary_loss_clip": 0.01168088, "auxiliary_loss_mlp": 0.01061738, "balance_loss_clip": 1.05670547, "balance_loss_mlp": 1.04115057, "epoch": 0.13912520667368106, "flos": 24535104111360.0, "grad_norm": 1.784990717237318, "language_loss": 0.79935932, "learning_rate": 3.876377616820024e-06, "loss": 0.8216576, "num_input_tokens_seen": 50110595, "step": 2314, "time_per_iteration": 2.683448076248169 }, { "auxiliary_loss_clip": 0.01124732, "auxiliary_loss_mlp": 0.01061041, "balance_loss_clip": 1.04845023, "balance_loss_mlp": 1.04103708, "epoch": 0.13918532992634902, "flos": 19383215287680.0, "grad_norm": 2.585875079553688, "language_loss": 0.85367405, "learning_rate": 3.876242779245409e-06, "loss": 0.87553179, "num_input_tokens_seen": 50125430, "step": 2315, "time_per_iteration": 4.332594394683838 }, { "auxiliary_loss_clip": 0.01156122, "auxiliary_loss_mlp": 0.01058532, "balance_loss_clip": 1.05397022, "balance_loss_mlp": 1.0372889, "epoch": 0.139245453179017, "flos": 21323756574720.0, "grad_norm": 2.333331492160627, "language_loss": 0.77170396, "learning_rate": 3.876107870523477e-06, "loss": 0.79385042, "num_input_tokens_seen": 50144120, "step": 2316, "time_per_iteration": 2.654604911804199 }, { "auxiliary_loss_clip": 0.01163967, "auxiliary_loss_mlp": 0.00780027, "balance_loss_clip": 1.05353916, "balance_loss_mlp": 1.00024533, "epoch": 0.13930557643168495, "flos": 19500607912320.0, "grad_norm": 2.1485284032262086, "language_loss": 0.76820493, "learning_rate": 3.875972890659349e-06, "loss": 0.78764486, "num_input_tokens_seen": 50162500, "step": 2317, "time_per_iteration": 2.6501235961914062 }, { "auxiliary_loss_clip": 0.01144052, "auxiliary_loss_mlp": 0.01061042, "balance_loss_clip": 1.05156648, "balance_loss_mlp": 1.04074025, "epoch": 0.13936569968435292, "flos": 25410821690880.0, "grad_norm": 1.7797832869421444, "language_loss": 0.80185997, "learning_rate": 3.875837839658139e-06, "loss": 0.82391089, "num_input_tokens_seen": 50182415, "step": 2318, "time_per_iteration": 2.7097995281219482 }, { "auxiliary_loss_clip": 0.01049096, "auxiliary_loss_mlp": 0.01048478, "balance_loss_clip": 1.03358936, "balance_loss_mlp": 1.04518783, "epoch": 0.13942582293702088, "flos": 70771063731840.0, "grad_norm": 0.854553938374386, "language_loss": 0.59004617, "learning_rate": 3.87570271752497e-06, "loss": 0.61102188, "num_input_tokens_seen": 50245160, "step": 2319, "time_per_iteration": 3.2631640434265137 }, { "auxiliary_loss_clip": 0.0111484, "auxiliary_loss_mlp": 0.01055367, "balance_loss_clip": 1.04508984, "balance_loss_mlp": 1.03437412, "epoch": 0.13948594618968888, "flos": 35590885920000.0, "grad_norm": 2.3313836691947722, "language_loss": 0.64993447, "learning_rate": 3.875567524264967e-06, "loss": 0.67163646, "num_input_tokens_seen": 50268215, "step": 2320, "time_per_iteration": 2.8668782711029053 }, { "auxiliary_loss_clip": 0.01096421, "auxiliary_loss_mlp": 0.01056652, "balance_loss_clip": 1.04400086, "balance_loss_mlp": 1.03521848, "epoch": 0.13954606944235684, "flos": 21105204272640.0, "grad_norm": 2.285151015895421, "language_loss": 0.70708811, "learning_rate": 3.875432259883256e-06, "loss": 0.72861886, "num_input_tokens_seen": 50288575, "step": 2321, "time_per_iteration": 2.8273603916168213 }, { "auxiliary_loss_clip": 0.01117698, "auxiliary_loss_mlp": 0.01061754, "balance_loss_clip": 1.04603076, "balance_loss_mlp": 1.03698206, "epoch": 0.1396061926950248, "flos": 25044425009280.0, "grad_norm": 1.7926270181208543, "language_loss": 0.85931206, "learning_rate": 3.875296924384965e-06, "loss": 0.88110662, "num_input_tokens_seen": 50308735, "step": 2322, "time_per_iteration": 2.833807945251465 }, { "auxiliary_loss_clip": 0.01120545, "auxiliary_loss_mlp": 0.01055036, "balance_loss_clip": 1.04616976, "balance_loss_mlp": 1.03568828, "epoch": 0.13966631594769277, "flos": 37634023428480.0, "grad_norm": 1.5963293576391182, "language_loss": 0.67159557, "learning_rate": 3.875161517775226e-06, "loss": 0.69335139, "num_input_tokens_seen": 50331025, "step": 2323, "time_per_iteration": 2.875265121459961 }, { "auxiliary_loss_clip": 0.01127992, "auxiliary_loss_mlp": 0.01055173, "balance_loss_clip": 1.04900301, "balance_loss_mlp": 1.03432369, "epoch": 0.13972643920036074, "flos": 16690993061760.0, "grad_norm": 2.0757452253793485, "language_loss": 0.88878977, "learning_rate": 3.875026040059175e-06, "loss": 0.9106214, "num_input_tokens_seen": 50349725, "step": 2324, "time_per_iteration": 2.6841063499450684 }, { "auxiliary_loss_clip": 0.01154799, "auxiliary_loss_mlp": 0.01056834, "balance_loss_clip": 1.05145955, "balance_loss_mlp": 1.03541231, "epoch": 0.1397865624530287, "flos": 23331055288320.0, "grad_norm": 2.8450589371660526, "language_loss": 0.70621002, "learning_rate": 3.8748904912419485e-06, "loss": 0.72832638, "num_input_tokens_seen": 50367965, "step": 2325, "time_per_iteration": 2.694218397140503 }, { "auxiliary_loss_clip": 0.01134393, "auxiliary_loss_mlp": 0.00778751, "balance_loss_clip": 1.05273592, "balance_loss_mlp": 1.00028229, "epoch": 0.13984668570569667, "flos": 22778317825920.0, "grad_norm": 2.230299294128946, "language_loss": 0.81657004, "learning_rate": 3.874754871328688e-06, "loss": 0.83570141, "num_input_tokens_seen": 50385605, "step": 2326, "time_per_iteration": 2.715306282043457 }, { "auxiliary_loss_clip": 0.01151297, "auxiliary_loss_mlp": 0.01045813, "balance_loss_clip": 1.05490732, "balance_loss_mlp": 1.02745473, "epoch": 0.13990680895836466, "flos": 19464553635840.0, "grad_norm": 1.729713540462037, "language_loss": 0.89241689, "learning_rate": 3.874619180324534e-06, "loss": 0.91438794, "num_input_tokens_seen": 50403985, "step": 2327, "time_per_iteration": 2.679626941680908 }, { "auxiliary_loss_clip": 0.01119996, "auxiliary_loss_mlp": 0.01057397, "balance_loss_clip": 1.04873121, "balance_loss_mlp": 1.0352242, "epoch": 0.13996693221103262, "flos": 20303283185280.0, "grad_norm": 2.9217951598838363, "language_loss": 0.84760427, "learning_rate": 3.874483418234632e-06, "loss": 0.86937821, "num_input_tokens_seen": 50421590, "step": 2328, "time_per_iteration": 2.7277352809906006 }, { "auxiliary_loss_clip": 0.01151775, "auxiliary_loss_mlp": 0.0104443, "balance_loss_clip": 1.05300856, "balance_loss_mlp": 1.02421176, "epoch": 0.1400270554637006, "flos": 26617707688320.0, "grad_norm": 1.6116398320348613, "language_loss": 0.73835862, "learning_rate": 3.874347585064131e-06, "loss": 0.76032066, "num_input_tokens_seen": 50443945, "step": 2329, "time_per_iteration": 2.6911025047302246 }, { "auxiliary_loss_clip": 0.01153137, "auxiliary_loss_mlp": 0.01046755, "balance_loss_clip": 1.05254042, "balance_loss_mlp": 1.02644169, "epoch": 0.14008717871636855, "flos": 19391475415680.0, "grad_norm": 2.565670250114109, "language_loss": 0.78373277, "learning_rate": 3.874211680818183e-06, "loss": 0.80573165, "num_input_tokens_seen": 50462065, "step": 2330, "time_per_iteration": 2.703225612640381 }, { "auxiliary_loss_clip": 0.01144455, "auxiliary_loss_mlp": 0.01046085, "balance_loss_clip": 1.05247569, "balance_loss_mlp": 1.02692819, "epoch": 0.14014730196903652, "flos": 15304266645120.0, "grad_norm": 2.2215524337864143, "language_loss": 0.72115719, "learning_rate": 3.87407570550194e-06, "loss": 0.74306256, "num_input_tokens_seen": 50479565, "step": 2331, "time_per_iteration": 2.7044217586517334 }, { "auxiliary_loss_clip": 0.01159691, "auxiliary_loss_mlp": 0.01051771, "balance_loss_clip": 1.0558939, "balance_loss_mlp": 1.03234017, "epoch": 0.14020742522170448, "flos": 14939701557120.0, "grad_norm": 1.5806705357110964, "language_loss": 0.72634697, "learning_rate": 3.873939659120557e-06, "loss": 0.7484616, "num_input_tokens_seen": 50497305, "step": 2332, "time_per_iteration": 2.647564649581909 }, { "auxiliary_loss_clip": 0.01063058, "auxiliary_loss_mlp": 0.01022564, "balance_loss_clip": 1.03391051, "balance_loss_mlp": 1.01944101, "epoch": 0.14026754847437245, "flos": 48824580044160.0, "grad_norm": 0.8445516092095569, "language_loss": 0.56185365, "learning_rate": 3.873803541679196e-06, "loss": 0.58270991, "num_input_tokens_seen": 50549735, "step": 2333, "time_per_iteration": 3.038390636444092 }, { "auxiliary_loss_clip": 0.01127793, "auxiliary_loss_mlp": 0.01045888, "balance_loss_clip": 1.05246043, "balance_loss_mlp": 1.02587318, "epoch": 0.14032767172704044, "flos": 25773267876480.0, "grad_norm": 1.7702774265545234, "language_loss": 0.82728767, "learning_rate": 3.873667353183016e-06, "loss": 0.84902453, "num_input_tokens_seen": 50570100, "step": 2334, "time_per_iteration": 2.7205803394317627 }, { "auxiliary_loss_clip": 0.01129244, "auxiliary_loss_mlp": 0.01044663, "balance_loss_clip": 1.05110407, "balance_loss_mlp": 1.02593565, "epoch": 0.1403877949797084, "flos": 21216312017280.0, "grad_norm": 1.7790720657464538, "language_loss": 0.80958998, "learning_rate": 3.8735310936371825e-06, "loss": 0.83132899, "num_input_tokens_seen": 50589185, "step": 2335, "time_per_iteration": 2.7844314575195312 }, { "auxiliary_loss_clip": 0.01108373, "auxiliary_loss_mlp": 0.0104374, "balance_loss_clip": 1.04802513, "balance_loss_mlp": 1.02160311, "epoch": 0.14044791823237637, "flos": 22747973811840.0, "grad_norm": 1.739505291070366, "language_loss": 0.81987065, "learning_rate": 3.873394763046862e-06, "loss": 0.84139174, "num_input_tokens_seen": 50609645, "step": 2336, "time_per_iteration": 2.7787351608276367 }, { "auxiliary_loss_clip": 0.01150445, "auxiliary_loss_mlp": 0.01046319, "balance_loss_clip": 1.05603921, "balance_loss_mlp": 1.02709103, "epoch": 0.14050804148504434, "flos": 22964443125120.0, "grad_norm": 1.7584048007565314, "language_loss": 0.80606967, "learning_rate": 3.873258361417225e-06, "loss": 0.82803738, "num_input_tokens_seen": 50628385, "step": 2337, "time_per_iteration": 2.6119275093078613 }, { "auxiliary_loss_clip": 0.01150898, "auxiliary_loss_mlp": 0.01051074, "balance_loss_clip": 1.05363941, "balance_loss_mlp": 1.03202438, "epoch": 0.1405681647377123, "flos": 22200336080640.0, "grad_norm": 2.383737065589604, "language_loss": 0.78994334, "learning_rate": 3.873121888753442e-06, "loss": 0.81196302, "num_input_tokens_seen": 50647260, "step": 2338, "time_per_iteration": 2.672427177429199 }, { "auxiliary_loss_clip": 0.01158377, "auxiliary_loss_mlp": 0.01050168, "balance_loss_clip": 1.05894089, "balance_loss_mlp": 1.02919865, "epoch": 0.14062828799038027, "flos": 23732787974400.0, "grad_norm": 2.117725014058833, "language_loss": 0.79766536, "learning_rate": 3.87298534506069e-06, "loss": 0.81975079, "num_input_tokens_seen": 50666130, "step": 2339, "time_per_iteration": 2.68635892868042 }, { "auxiliary_loss_clip": 0.01097095, "auxiliary_loss_mlp": 0.01065327, "balance_loss_clip": 1.04686952, "balance_loss_mlp": 1.04463232, "epoch": 0.14068841124304826, "flos": 39202493685120.0, "grad_norm": 2.0269377249156793, "language_loss": 0.65632963, "learning_rate": 3.872848730344146e-06, "loss": 0.67795384, "num_input_tokens_seen": 50687440, "step": 2340, "time_per_iteration": 2.9426286220550537 }, { "auxiliary_loss_clip": 0.0114865, "auxiliary_loss_mlp": 0.01050723, "balance_loss_clip": 1.05418086, "balance_loss_mlp": 1.0310297, "epoch": 0.14074853449571623, "flos": 20192283181440.0, "grad_norm": 2.8518792803213917, "language_loss": 0.78760445, "learning_rate": 3.87271204460899e-06, "loss": 0.80959821, "num_input_tokens_seen": 50704030, "step": 2341, "time_per_iteration": 2.8814899921417236 }, { "auxiliary_loss_clip": 0.01162758, "auxiliary_loss_mlp": 0.01057334, "balance_loss_clip": 1.0554986, "balance_loss_mlp": 1.03876162, "epoch": 0.1408086577483842, "flos": 18405871153920.0, "grad_norm": 2.2693198584224454, "language_loss": 0.80322361, "learning_rate": 3.8725752878604066e-06, "loss": 0.82542449, "num_input_tokens_seen": 50723305, "step": 2342, "time_per_iteration": 2.604814291000366 }, { "auxiliary_loss_clip": 0.01152048, "auxiliary_loss_mlp": 0.01056552, "balance_loss_clip": 1.05776191, "balance_loss_mlp": 1.03858757, "epoch": 0.14086878100105216, "flos": 25264593423360.0, "grad_norm": 2.4727499245104343, "language_loss": 0.77686632, "learning_rate": 3.87243846010358e-06, "loss": 0.79895234, "num_input_tokens_seen": 50743270, "step": 2343, "time_per_iteration": 2.676823854446411 }, { "auxiliary_loss_clip": 0.0105659, "auxiliary_loss_mlp": 0.01037584, "balance_loss_clip": 1.03650093, "balance_loss_mlp": 1.03438878, "epoch": 0.14092890425372012, "flos": 65978388869760.0, "grad_norm": 0.8521752699932517, "language_loss": 0.61553669, "learning_rate": 3.872301561343699e-06, "loss": 0.63647842, "num_input_tokens_seen": 50802710, "step": 2344, "time_per_iteration": 3.156792402267456 }, { "auxiliary_loss_clip": 0.01147637, "auxiliary_loss_mlp": 0.01049362, "balance_loss_clip": 1.05167484, "balance_loss_mlp": 1.03121877, "epoch": 0.1409890275063881, "flos": 23694973931520.0, "grad_norm": 1.558783678159347, "language_loss": 0.64331692, "learning_rate": 3.872164591585956e-06, "loss": 0.6652869, "num_input_tokens_seen": 50822625, "step": 2345, "time_per_iteration": 2.654100179672241 }, { "auxiliary_loss_clip": 0.01154879, "auxiliary_loss_mlp": 0.0104633, "balance_loss_clip": 1.05009735, "balance_loss_mlp": 1.02562308, "epoch": 0.14104915075905605, "flos": 23623152687360.0, "grad_norm": 2.26337760563351, "language_loss": 0.73892581, "learning_rate": 3.8720275508355435e-06, "loss": 0.76093793, "num_input_tokens_seen": 50842330, "step": 2346, "time_per_iteration": 2.7032830715179443 }, { "auxiliary_loss_clip": 0.0115447, "auxiliary_loss_mlp": 0.01048793, "balance_loss_clip": 1.0572027, "balance_loss_mlp": 1.02929008, "epoch": 0.14110927401172405, "flos": 20595165102720.0, "grad_norm": 1.7675181118684058, "language_loss": 0.7727294, "learning_rate": 3.8718904390976585e-06, "loss": 0.79476202, "num_input_tokens_seen": 50861035, "step": 2347, "time_per_iteration": 2.678647518157959 }, { "auxiliary_loss_clip": 0.01164131, "auxiliary_loss_mlp": 0.01052088, "balance_loss_clip": 1.05490732, "balance_loss_mlp": 1.03370619, "epoch": 0.141169397264392, "flos": 28548049512960.0, "grad_norm": 2.592464695784388, "language_loss": 0.76753062, "learning_rate": 3.8717532563775e-06, "loss": 0.78969282, "num_input_tokens_seen": 50880105, "step": 2348, "time_per_iteration": 2.7450597286224365 }, { "auxiliary_loss_clip": 0.01147264, "auxiliary_loss_mlp": 0.01042525, "balance_loss_clip": 1.05267334, "balance_loss_mlp": 1.02295136, "epoch": 0.14122952051705998, "flos": 17092258871040.0, "grad_norm": 1.8617784303344698, "language_loss": 0.86794335, "learning_rate": 3.871616002680272e-06, "loss": 0.8898412, "num_input_tokens_seen": 50897720, "step": 2349, "time_per_iteration": 2.662508964538574 }, { "auxiliary_loss_clip": 0.01150971, "auxiliary_loss_mlp": 0.01048616, "balance_loss_clip": 1.05632985, "balance_loss_mlp": 1.02897048, "epoch": 0.14128964376972794, "flos": 28946801370240.0, "grad_norm": 2.650060051711467, "language_loss": 0.88758218, "learning_rate": 3.871478678011177e-06, "loss": 0.90957808, "num_input_tokens_seen": 50918385, "step": 2350, "time_per_iteration": 4.1697962284088135 }, { "auxiliary_loss_clip": 0.01142704, "auxiliary_loss_mlp": 0.01045134, "balance_loss_clip": 1.05369377, "balance_loss_mlp": 1.02442729, "epoch": 0.1413497670223959, "flos": 18989778643200.0, "grad_norm": 1.801090232061166, "language_loss": 0.8094542, "learning_rate": 3.871341282375423e-06, "loss": 0.83133256, "num_input_tokens_seen": 50938270, "step": 2351, "time_per_iteration": 2.6769907474517822 }, { "auxiliary_loss_clip": 0.01149546, "auxiliary_loss_mlp": 0.01040141, "balance_loss_clip": 1.05100775, "balance_loss_mlp": 1.02096045, "epoch": 0.14140989027506387, "flos": 29862236413440.0, "grad_norm": 2.590933181784672, "language_loss": 0.82796198, "learning_rate": 3.871203815778219e-06, "loss": 0.84985888, "num_input_tokens_seen": 50958155, "step": 2352, "time_per_iteration": 5.713203430175781 }, { "auxiliary_loss_clip": 0.01063742, "auxiliary_loss_mlp": 0.01009803, "balance_loss_clip": 1.03462291, "balance_loss_mlp": 1.0060122, "epoch": 0.14147001352773186, "flos": 62079532041600.0, "grad_norm": 0.9118003008214054, "language_loss": 0.61876011, "learning_rate": 3.87106627822478e-06, "loss": 0.63949555, "num_input_tokens_seen": 51020705, "step": 2353, "time_per_iteration": 3.1698319911956787 }, { "auxiliary_loss_clip": 0.01134069, "auxiliary_loss_mlp": 0.01049094, "balance_loss_clip": 1.0536828, "balance_loss_mlp": 1.03039002, "epoch": 0.14153013678039983, "flos": 22017514832640.0, "grad_norm": 1.5909284402791886, "language_loss": 0.87075388, "learning_rate": 3.8709286697203196e-06, "loss": 0.89258552, "num_input_tokens_seen": 51039995, "step": 2354, "time_per_iteration": 2.6781272888183594 }, { "auxiliary_loss_clip": 0.01124592, "auxiliary_loss_mlp": 0.0104583, "balance_loss_clip": 1.0527302, "balance_loss_mlp": 1.02562428, "epoch": 0.1415902600330678, "flos": 19720093968000.0, "grad_norm": 2.035812967878614, "language_loss": 0.74701214, "learning_rate": 3.870790990270057e-06, "loss": 0.76871634, "num_input_tokens_seen": 51059075, "step": 2355, "time_per_iteration": 4.464852571487427 }, { "auxiliary_loss_clip": 0.01062228, "auxiliary_loss_mlp": 0.01003337, "balance_loss_clip": 1.03320074, "balance_loss_mlp": 0.99947417, "epoch": 0.14165038328573576, "flos": 65900929190400.0, "grad_norm": 0.6801443738216844, "language_loss": 0.51819825, "learning_rate": 3.870653239879212e-06, "loss": 0.53885388, "num_input_tokens_seen": 51120380, "step": 2356, "time_per_iteration": 3.094026803970337 }, { "auxiliary_loss_clip": 0.01165635, "auxiliary_loss_mlp": 0.01057535, "balance_loss_clip": 1.05662966, "balance_loss_mlp": 1.0379492, "epoch": 0.14171050653840372, "flos": 12130158533760.0, "grad_norm": 1.9928903491175036, "language_loss": 0.70598352, "learning_rate": 3.8705154185530095e-06, "loss": 0.72821522, "num_input_tokens_seen": 51136950, "step": 2357, "time_per_iteration": 2.569486141204834 }, { "auxiliary_loss_clip": 0.01117022, "auxiliary_loss_mlp": 0.01054948, "balance_loss_clip": 1.04706419, "balance_loss_mlp": 1.0355413, "epoch": 0.1417706297910717, "flos": 20412487509120.0, "grad_norm": 2.1046358800035234, "language_loss": 0.82020235, "learning_rate": 3.870377526296674e-06, "loss": 0.84192204, "num_input_tokens_seen": 51155175, "step": 2358, "time_per_iteration": 2.719344139099121 }, { "auxiliary_loss_clip": 0.01145283, "auxiliary_loss_mlp": 0.01050239, "balance_loss_clip": 1.05257189, "balance_loss_mlp": 1.02932954, "epoch": 0.14183075304373965, "flos": 22380607463040.0, "grad_norm": 2.2336131404929787, "language_loss": 0.71575904, "learning_rate": 3.870239563115436e-06, "loss": 0.73771417, "num_input_tokens_seen": 51174500, "step": 2359, "time_per_iteration": 2.6914820671081543 }, { "auxiliary_loss_clip": 0.0111529, "auxiliary_loss_mlp": 0.007787, "balance_loss_clip": 1.0526464, "balance_loss_mlp": 1.00033379, "epoch": 0.14189087629640765, "flos": 21580913018880.0, "grad_norm": 2.4314273775499906, "language_loss": 0.7541784, "learning_rate": 3.870101529014526e-06, "loss": 0.77311832, "num_input_tokens_seen": 51194270, "step": 2360, "time_per_iteration": 2.803493022918701 }, { "auxiliary_loss_clip": 0.01108644, "auxiliary_loss_mlp": 0.01053684, "balance_loss_clip": 1.0491271, "balance_loss_mlp": 1.03136814, "epoch": 0.1419509995490756, "flos": 20008564093440.0, "grad_norm": 2.374719540518049, "language_loss": 0.81920552, "learning_rate": 3.869963423999178e-06, "loss": 0.84082878, "num_input_tokens_seen": 51211850, "step": 2361, "time_per_iteration": 2.8039920330047607 }, { "auxiliary_loss_clip": 0.0115065, "auxiliary_loss_mlp": 0.01057946, "balance_loss_clip": 1.05230403, "balance_loss_mlp": 1.03802609, "epoch": 0.14201112280174358, "flos": 31941464112000.0, "grad_norm": 1.9397979109407166, "language_loss": 0.74081504, "learning_rate": 3.86982524807463e-06, "loss": 0.76290095, "num_input_tokens_seen": 51233545, "step": 2362, "time_per_iteration": 2.7272114753723145 }, { "auxiliary_loss_clip": 0.0115354, "auxiliary_loss_mlp": 0.01048321, "balance_loss_clip": 1.05355787, "balance_loss_mlp": 1.02861547, "epoch": 0.14207124605441154, "flos": 41464147582080.0, "grad_norm": 1.7489521991344694, "language_loss": 0.74221587, "learning_rate": 3.869687001246122e-06, "loss": 0.76423442, "num_input_tokens_seen": 51257615, "step": 2363, "time_per_iteration": 2.789802312850952 }, { "auxiliary_loss_clip": 0.01128802, "auxiliary_loss_mlp": 0.0105205, "balance_loss_clip": 1.04769099, "balance_loss_mlp": 1.03180885, "epoch": 0.1421313693070795, "flos": 31905086613120.0, "grad_norm": 1.7832713632097879, "language_loss": 0.73034167, "learning_rate": 3.8695486835188946e-06, "loss": 0.75215018, "num_input_tokens_seen": 51279645, "step": 2364, "time_per_iteration": 2.8508312702178955 }, { "auxiliary_loss_clip": 0.01142769, "auxiliary_loss_mlp": 0.01049829, "balance_loss_clip": 1.05160844, "balance_loss_mlp": 1.03207827, "epoch": 0.14219149255974747, "flos": 26871165031680.0, "grad_norm": 1.875477198706701, "language_loss": 0.90395916, "learning_rate": 3.869410294898195e-06, "loss": 0.92588514, "num_input_tokens_seen": 51299775, "step": 2365, "time_per_iteration": 2.6807806491851807 }, { "auxiliary_loss_clip": 0.01127252, "auxiliary_loss_mlp": 0.01054912, "balance_loss_clip": 1.04759967, "balance_loss_mlp": 1.03394318, "epoch": 0.14225161581241544, "flos": 27454426076160.0, "grad_norm": 1.719218863067841, "language_loss": 0.65305161, "learning_rate": 3.869271835389268e-06, "loss": 0.67487329, "num_input_tokens_seen": 51319430, "step": 2366, "time_per_iteration": 2.7293641567230225 }, { "auxiliary_loss_clip": 0.01143576, "auxiliary_loss_mlp": 0.01051629, "balance_loss_clip": 1.05218709, "balance_loss_mlp": 1.03058839, "epoch": 0.14231173906508343, "flos": 10561436881920.0, "grad_norm": 2.3740196514966256, "language_loss": 0.80331928, "learning_rate": 3.8691333049973665e-06, "loss": 0.82527137, "num_input_tokens_seen": 51336045, "step": 2367, "time_per_iteration": 2.67529296875 }, { "auxiliary_loss_clip": 0.01138517, "auxiliary_loss_mlp": 0.01062653, "balance_loss_clip": 1.05117869, "balance_loss_mlp": 1.0402534, "epoch": 0.1423718623177514, "flos": 28360882719360.0, "grad_norm": 2.0081973718426283, "language_loss": 0.82346755, "learning_rate": 3.868994703727742e-06, "loss": 0.84547925, "num_input_tokens_seen": 51357030, "step": 2368, "time_per_iteration": 2.7447288036346436 }, { "auxiliary_loss_clip": 0.01122755, "auxiliary_loss_mlp": 0.01052229, "balance_loss_clip": 1.05180073, "balance_loss_mlp": 1.03065228, "epoch": 0.14243198557041936, "flos": 19354235990400.0, "grad_norm": 2.6586279461428144, "language_loss": 0.8711772, "learning_rate": 3.868856031585652e-06, "loss": 0.89292705, "num_input_tokens_seen": 51374890, "step": 2369, "time_per_iteration": 2.736872673034668 }, { "auxiliary_loss_clip": 0.01127301, "auxiliary_loss_mlp": 0.0104182, "balance_loss_clip": 1.05011857, "balance_loss_mlp": 1.02170992, "epoch": 0.14249210882308733, "flos": 28806857982720.0, "grad_norm": 1.7900856007188275, "language_loss": 0.75828248, "learning_rate": 3.868717288576354e-06, "loss": 0.77997375, "num_input_tokens_seen": 51398100, "step": 2370, "time_per_iteration": 2.762603998184204 }, { "auxiliary_loss_clip": 0.01158195, "auxiliary_loss_mlp": 0.00781098, "balance_loss_clip": 1.05268764, "balance_loss_mlp": 1.00028419, "epoch": 0.1425522320757553, "flos": 21835016807040.0, "grad_norm": 1.7770434161065212, "language_loss": 0.82934797, "learning_rate": 3.868578474705109e-06, "loss": 0.84874088, "num_input_tokens_seen": 51418745, "step": 2371, "time_per_iteration": 2.6224656105041504 }, { "auxiliary_loss_clip": 0.01173447, "auxiliary_loss_mlp": 0.0105718, "balance_loss_clip": 1.05837953, "balance_loss_mlp": 1.03638947, "epoch": 0.14261235532842326, "flos": 17311457617920.0, "grad_norm": 2.0431625041319825, "language_loss": 0.82982123, "learning_rate": 3.868439589977181e-06, "loss": 0.85212755, "num_input_tokens_seen": 51437455, "step": 2372, "time_per_iteration": 2.575690269470215 }, { "auxiliary_loss_clip": 0.01172196, "auxiliary_loss_mlp": 0.0105022, "balance_loss_clip": 1.0581125, "balance_loss_mlp": 1.0285356, "epoch": 0.14267247858109125, "flos": 18806741913600.0, "grad_norm": 3.3704326167450582, "language_loss": 0.8438468, "learning_rate": 3.868300634397836e-06, "loss": 0.86607099, "num_input_tokens_seen": 51455710, "step": 2373, "time_per_iteration": 2.7160356044769287 }, { "auxiliary_loss_clip": 0.01141742, "auxiliary_loss_mlp": 0.01055295, "balance_loss_clip": 1.05160809, "balance_loss_mlp": 1.03598261, "epoch": 0.14273260183375922, "flos": 11358904682880.0, "grad_norm": 3.5035356392631836, "language_loss": 0.86027539, "learning_rate": 3.8681616079723445e-06, "loss": 0.88224572, "num_input_tokens_seen": 51471270, "step": 2374, "time_per_iteration": 2.6845595836639404 }, { "auxiliary_loss_clip": 0.01164623, "auxiliary_loss_mlp": 0.01061957, "balance_loss_clip": 1.05515146, "balance_loss_mlp": 1.03996301, "epoch": 0.14279272508642718, "flos": 27567688636800.0, "grad_norm": 1.6059368749673757, "language_loss": 0.79169822, "learning_rate": 3.868022510705977e-06, "loss": 0.81396401, "num_input_tokens_seen": 51492705, "step": 2375, "time_per_iteration": 2.738156795501709 }, { "auxiliary_loss_clip": 0.01163115, "auxiliary_loss_mlp": 0.01058224, "balance_loss_clip": 1.05641222, "balance_loss_mlp": 1.0368259, "epoch": 0.14285284833909515, "flos": 16252559654400.0, "grad_norm": 2.559097553272684, "language_loss": 0.76907504, "learning_rate": 3.867883342604009e-06, "loss": 0.79128844, "num_input_tokens_seen": 51510780, "step": 2376, "time_per_iteration": 2.751178741455078 }, { "auxiliary_loss_clip": 0.01160115, "auxiliary_loss_mlp": 0.0105168, "balance_loss_clip": 1.054515, "balance_loss_mlp": 1.03040111, "epoch": 0.1429129715917631, "flos": 19755609540480.0, "grad_norm": 2.7331999261828592, "language_loss": 0.92795181, "learning_rate": 3.867744103671717e-06, "loss": 0.95006979, "num_input_tokens_seen": 51531400, "step": 2377, "time_per_iteration": 2.6584725379943848 }, { "auxiliary_loss_clip": 0.01147246, "auxiliary_loss_mlp": 0.01061419, "balance_loss_clip": 1.05362535, "balance_loss_mlp": 1.03793442, "epoch": 0.14297309484443108, "flos": 21137092571520.0, "grad_norm": 2.9252003733204894, "language_loss": 0.91754365, "learning_rate": 3.867604793914382e-06, "loss": 0.93963027, "num_input_tokens_seen": 51548215, "step": 2378, "time_per_iteration": 2.8107075691223145 }, { "auxiliary_loss_clip": 0.01164153, "auxiliary_loss_mlp": 0.0105303, "balance_loss_clip": 1.05712187, "balance_loss_mlp": 1.03092849, "epoch": 0.14303321809709904, "flos": 23586667447680.0, "grad_norm": 2.1292902842232966, "language_loss": 0.73961306, "learning_rate": 3.8674654133372864e-06, "loss": 0.76178491, "num_input_tokens_seen": 51566820, "step": 2379, "time_per_iteration": 2.7029881477355957 }, { "auxiliary_loss_clip": 0.01137551, "auxiliary_loss_mlp": 0.01055012, "balance_loss_clip": 1.05204058, "balance_loss_mlp": 1.0330174, "epoch": 0.14309334134976703, "flos": 15888281875200.0, "grad_norm": 2.1898245228218784, "language_loss": 0.78818595, "learning_rate": 3.867325961945714e-06, "loss": 0.81011152, "num_input_tokens_seen": 51585075, "step": 2380, "time_per_iteration": 2.7213294506073 }, { "auxiliary_loss_clip": 0.01126442, "auxiliary_loss_mlp": 0.01057409, "balance_loss_clip": 1.05457354, "balance_loss_mlp": 1.03580785, "epoch": 0.143153464602435, "flos": 16325601960960.0, "grad_norm": 4.699041640805274, "language_loss": 0.87895483, "learning_rate": 3.867186439744955e-06, "loss": 0.90079331, "num_input_tokens_seen": 51603185, "step": 2381, "time_per_iteration": 2.7144110202789307 }, { "auxiliary_loss_clip": 0.01141327, "auxiliary_loss_mlp": 0.01052708, "balance_loss_clip": 1.05200005, "balance_loss_mlp": 1.03088117, "epoch": 0.14321358785510296, "flos": 17092079303040.0, "grad_norm": 2.47508592106904, "language_loss": 0.76396096, "learning_rate": 3.867046846740299e-06, "loss": 0.78590137, "num_input_tokens_seen": 51620880, "step": 2382, "time_per_iteration": 2.6185953617095947 }, { "auxiliary_loss_clip": 0.01132222, "auxiliary_loss_mlp": 0.01054019, "balance_loss_clip": 1.05162048, "balance_loss_mlp": 1.03319359, "epoch": 0.14327371110777093, "flos": 26322916769280.0, "grad_norm": 4.3017095308344375, "language_loss": 0.76636785, "learning_rate": 3.866907182937039e-06, "loss": 0.7882303, "num_input_tokens_seen": 51640170, "step": 2383, "time_per_iteration": 2.7408525943756104 }, { "auxiliary_loss_clip": 0.01139698, "auxiliary_loss_mlp": 0.01052888, "balance_loss_clip": 1.05078864, "balance_loss_mlp": 1.02926064, "epoch": 0.1433338343604389, "flos": 18076462502400.0, "grad_norm": 2.3526544982502284, "language_loss": 0.87649417, "learning_rate": 3.866767448340471e-06, "loss": 0.8984201, "num_input_tokens_seen": 51656580, "step": 2384, "time_per_iteration": 2.6798789501190186 }, { "auxiliary_loss_clip": 0.01164805, "auxiliary_loss_mlp": 0.01053206, "balance_loss_clip": 1.05644679, "balance_loss_mlp": 1.02985239, "epoch": 0.14339395761310686, "flos": 15522783033600.0, "grad_norm": 2.6134761315069284, "language_loss": 0.79340684, "learning_rate": 3.866627642955895e-06, "loss": 0.81558692, "num_input_tokens_seen": 51674645, "step": 2385, "time_per_iteration": 2.5856544971466064 }, { "auxiliary_loss_clip": 0.01156607, "auxiliary_loss_mlp": 0.01042784, "balance_loss_clip": 1.05148256, "balance_loss_mlp": 1.02182722, "epoch": 0.14345408086577485, "flos": 28548767784960.0, "grad_norm": 2.6990187663653247, "language_loss": 0.74960196, "learning_rate": 3.866487766788612e-06, "loss": 0.77159584, "num_input_tokens_seen": 51695770, "step": 2386, "time_per_iteration": 2.6670751571655273 }, { "auxiliary_loss_clip": 0.01171639, "auxiliary_loss_mlp": 0.01048096, "balance_loss_clip": 1.05699563, "balance_loss_mlp": 1.02733016, "epoch": 0.14351420411844282, "flos": 20230061310720.0, "grad_norm": 2.299870083842227, "language_loss": 0.78659731, "learning_rate": 3.866347819843925e-06, "loss": 0.80879462, "num_input_tokens_seen": 51714165, "step": 2387, "time_per_iteration": 2.5805532932281494 }, { "auxiliary_loss_clip": 0.01140581, "auxiliary_loss_mlp": 0.01055299, "balance_loss_clip": 1.05355716, "balance_loss_mlp": 1.03317428, "epoch": 0.14357432737111078, "flos": 19865029345920.0, "grad_norm": 6.554164509194222, "language_loss": 0.82492924, "learning_rate": 3.866207802127143e-06, "loss": 0.84688807, "num_input_tokens_seen": 51734440, "step": 2388, "time_per_iteration": 2.656609058380127 }, { "auxiliary_loss_clip": 0.01155007, "auxiliary_loss_mlp": 0.01047154, "balance_loss_clip": 1.0537287, "balance_loss_mlp": 1.02674508, "epoch": 0.14363445062377875, "flos": 28256814040320.0, "grad_norm": 2.5973624291758655, "language_loss": 0.82025754, "learning_rate": 3.866067713643573e-06, "loss": 0.84227914, "num_input_tokens_seen": 51753730, "step": 2389, "time_per_iteration": 4.21793794631958 }, { "auxiliary_loss_clip": 0.01145665, "auxiliary_loss_mlp": 0.01046852, "balance_loss_clip": 1.05107975, "balance_loss_mlp": 1.02513266, "epoch": 0.1436945738764467, "flos": 18186672407040.0, "grad_norm": 3.7970835440683097, "language_loss": 0.83056784, "learning_rate": 3.8659275543985285e-06, "loss": 0.85249299, "num_input_tokens_seen": 51771195, "step": 2390, "time_per_iteration": 2.6859514713287354 }, { "auxiliary_loss_clip": 0.01152608, "auxiliary_loss_mlp": 0.01054404, "balance_loss_clip": 1.05400729, "balance_loss_mlp": 1.0334475, "epoch": 0.14375469712911468, "flos": 27307910499840.0, "grad_norm": 1.8176612067028404, "language_loss": 0.75018179, "learning_rate": 3.865787324397324e-06, "loss": 0.77225184, "num_input_tokens_seen": 51792290, "step": 2391, "time_per_iteration": 5.726900577545166 }, { "auxiliary_loss_clip": 0.01045505, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.03226101, "balance_loss_mlp": 1.0303973, "epoch": 0.14381482038178264, "flos": 56891445287040.0, "grad_norm": 0.8787809928903102, "language_loss": 0.61848003, "learning_rate": 3.865647023645277e-06, "loss": 0.63926852, "num_input_tokens_seen": 51843675, "step": 2392, "time_per_iteration": 3.113558053970337 }, { "auxiliary_loss_clip": 0.01158698, "auxiliary_loss_mlp": 0.01058807, "balance_loss_clip": 1.05467868, "balance_loss_mlp": 1.03608608, "epoch": 0.14387494363445064, "flos": 14282177143680.0, "grad_norm": 2.718376715006273, "language_loss": 0.77346605, "learning_rate": 3.865506652147709e-06, "loss": 0.79564106, "num_input_tokens_seen": 51860285, "step": 2393, "time_per_iteration": 2.6578521728515625 }, { "auxiliary_loss_clip": 0.0116951, "auxiliary_loss_mlp": 0.01052986, "balance_loss_clip": 1.05671048, "balance_loss_mlp": 1.03287578, "epoch": 0.1439350668871186, "flos": 26761493831040.0, "grad_norm": 5.715284956255472, "language_loss": 0.76301813, "learning_rate": 3.865366209909941e-06, "loss": 0.78524309, "num_input_tokens_seen": 51880105, "step": 2394, "time_per_iteration": 4.345217943191528 }, { "auxiliary_loss_clip": 0.01165266, "auxiliary_loss_mlp": 0.01053501, "balance_loss_clip": 1.05325842, "balance_loss_mlp": 1.03365326, "epoch": 0.14399519013978657, "flos": 40700040537600.0, "grad_norm": 2.2496244390836893, "language_loss": 0.85859704, "learning_rate": 3.8652256969372994e-06, "loss": 0.88078463, "num_input_tokens_seen": 51905175, "step": 2395, "time_per_iteration": 2.739717483520508 }, { "auxiliary_loss_clip": 0.0112523, "auxiliary_loss_mlp": 0.01051092, "balance_loss_clip": 1.04946184, "balance_loss_mlp": 1.028669, "epoch": 0.14405531339245453, "flos": 20557530627840.0, "grad_norm": 4.117082508421602, "language_loss": 0.82894099, "learning_rate": 3.865085113235113e-06, "loss": 0.85070425, "num_input_tokens_seen": 51924490, "step": 2396, "time_per_iteration": 2.686732053756714 }, { "auxiliary_loss_clip": 0.01126754, "auxiliary_loss_mlp": 0.00779833, "balance_loss_clip": 1.04752374, "balance_loss_mlp": 1.00036597, "epoch": 0.1441154366451225, "flos": 19572931946880.0, "grad_norm": 6.956399779275871, "language_loss": 0.82801461, "learning_rate": 3.864944458808712e-06, "loss": 0.84708053, "num_input_tokens_seen": 51940490, "step": 2397, "time_per_iteration": 2.742809534072876 }, { "auxiliary_loss_clip": 0.01168871, "auxiliary_loss_mlp": 0.0104994, "balance_loss_clip": 1.05485702, "balance_loss_mlp": 1.02892387, "epoch": 0.14417555989779046, "flos": 18515721922560.0, "grad_norm": 8.355198005975433, "language_loss": 0.8001197, "learning_rate": 3.86480373366343e-06, "loss": 0.82230783, "num_input_tokens_seen": 51957910, "step": 2398, "time_per_iteration": 2.573267936706543 }, { "auxiliary_loss_clip": 0.01152449, "auxiliary_loss_mlp": 0.01053407, "balance_loss_clip": 1.05287588, "balance_loss_mlp": 1.03336823, "epoch": 0.14423568315045843, "flos": 26031681296640.0, "grad_norm": 3.294581575970509, "language_loss": 0.64690518, "learning_rate": 3.864662937804603e-06, "loss": 0.66896379, "num_input_tokens_seen": 51978010, "step": 2399, "time_per_iteration": 2.6831774711608887 }, { "auxiliary_loss_clip": 0.01134916, "auxiliary_loss_mlp": 0.01052493, "balance_loss_clip": 1.04998159, "balance_loss_mlp": 1.03119016, "epoch": 0.14429580640312642, "flos": 21288743792640.0, "grad_norm": 3.586256880371596, "language_loss": 0.82207137, "learning_rate": 3.864522071237571e-06, "loss": 0.84394544, "num_input_tokens_seen": 51998515, "step": 2400, "time_per_iteration": 2.6812663078308105 }, { "auxiliary_loss_clip": 0.01149983, "auxiliary_loss_mlp": 0.01051884, "balance_loss_clip": 1.0567503, "balance_loss_mlp": 1.02954376, "epoch": 0.14435592965579438, "flos": 25627865621760.0, "grad_norm": 2.3908005596579165, "language_loss": 0.74217784, "learning_rate": 3.864381133967676e-06, "loss": 0.76419652, "num_input_tokens_seen": 52019270, "step": 2401, "time_per_iteration": 2.773838520050049 }, { "auxiliary_loss_clip": 0.01137207, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.05065656, "balance_loss_mlp": 1.02671885, "epoch": 0.14441605290846235, "flos": 22965053656320.0, "grad_norm": 2.616063077702737, "language_loss": 0.80771816, "learning_rate": 3.86424012600026e-06, "loss": 0.82956612, "num_input_tokens_seen": 52039315, "step": 2402, "time_per_iteration": 2.786031723022461 }, { "auxiliary_loss_clip": 0.01120897, "auxiliary_loss_mlp": 0.01052115, "balance_loss_clip": 1.04718328, "balance_loss_mlp": 1.02988231, "epoch": 0.14447617616113032, "flos": 17347655548800.0, "grad_norm": 2.397935571801219, "language_loss": 0.84159613, "learning_rate": 3.864099047340673e-06, "loss": 0.86332625, "num_input_tokens_seen": 52056555, "step": 2403, "time_per_iteration": 2.8113911151885986 }, { "auxiliary_loss_clip": 0.01129082, "auxiliary_loss_mlp": 0.00783127, "balance_loss_clip": 1.04854488, "balance_loss_mlp": 1.00030184, "epoch": 0.14453629941379828, "flos": 24060185464320.0, "grad_norm": 2.224282169770823, "language_loss": 0.70142806, "learning_rate": 3.863957897994262e-06, "loss": 0.72055018, "num_input_tokens_seen": 52075800, "step": 2404, "time_per_iteration": 2.7748003005981445 }, { "auxiliary_loss_clip": 0.01144289, "auxiliary_loss_mlp": 0.01051404, "balance_loss_clip": 1.05279732, "balance_loss_mlp": 1.03099549, "epoch": 0.14459642266646625, "flos": 14429554646400.0, "grad_norm": 2.429117427076043, "language_loss": 0.73179376, "learning_rate": 3.863816677966381e-06, "loss": 0.75375068, "num_input_tokens_seen": 52092585, "step": 2405, "time_per_iteration": 2.7927868366241455 }, { "auxiliary_loss_clip": 0.01108387, "auxiliary_loss_mlp": 0.01054584, "balance_loss_clip": 1.04661417, "balance_loss_mlp": 1.0326612, "epoch": 0.14465654591913424, "flos": 9867032179200.0, "grad_norm": 7.089523066959408, "language_loss": 0.73039794, "learning_rate": 3.863675387262386e-06, "loss": 0.75202763, "num_input_tokens_seen": 52108990, "step": 2406, "time_per_iteration": 2.742253303527832 }, { "auxiliary_loss_clip": 0.01157268, "auxiliary_loss_mlp": 0.01054465, "balance_loss_clip": 1.05420268, "balance_loss_mlp": 1.03198171, "epoch": 0.1447166691718022, "flos": 24972926987520.0, "grad_norm": 5.383630788916188, "language_loss": 0.75570732, "learning_rate": 3.8635340258876325e-06, "loss": 0.77782464, "num_input_tokens_seen": 52125385, "step": 2407, "time_per_iteration": 2.654636859893799 }, { "auxiliary_loss_clip": 0.0116674, "auxiliary_loss_mlp": 0.01054642, "balance_loss_clip": 1.05440819, "balance_loss_mlp": 1.03392315, "epoch": 0.14477679242447017, "flos": 21908023200000.0, "grad_norm": 2.0240540465866146, "language_loss": 0.79426706, "learning_rate": 3.8633925938474826e-06, "loss": 0.81648088, "num_input_tokens_seen": 52144985, "step": 2408, "time_per_iteration": 2.663611650466919 }, { "auxiliary_loss_clip": 0.01155332, "auxiliary_loss_mlp": 0.01053557, "balance_loss_clip": 1.05411625, "balance_loss_mlp": 1.03107429, "epoch": 0.14483691567713813, "flos": 20740746925440.0, "grad_norm": 2.249858190268702, "language_loss": 0.82188261, "learning_rate": 3.863251091147299e-06, "loss": 0.84397143, "num_input_tokens_seen": 52163885, "step": 2409, "time_per_iteration": 2.6218342781066895 }, { "auxiliary_loss_clip": 0.01116852, "auxiliary_loss_mlp": 0.01065498, "balance_loss_clip": 1.04859877, "balance_loss_mlp": 1.04340839, "epoch": 0.1448970389298061, "flos": 35407705536000.0, "grad_norm": 3.918408886138166, "language_loss": 0.74477464, "learning_rate": 3.863109517792446e-06, "loss": 0.76659817, "num_input_tokens_seen": 52184325, "step": 2410, "time_per_iteration": 2.8525002002716064 }, { "auxiliary_loss_clip": 0.01166422, "auxiliary_loss_mlp": 0.0105028, "balance_loss_clip": 1.05447876, "balance_loss_mlp": 1.0300622, "epoch": 0.14495716218247406, "flos": 15414368808960.0, "grad_norm": 2.976325973684052, "language_loss": 0.81616414, "learning_rate": 3.8629678737882945e-06, "loss": 0.8383311, "num_input_tokens_seen": 52202740, "step": 2411, "time_per_iteration": 2.580059051513672 }, { "auxiliary_loss_clip": 0.01143671, "auxiliary_loss_mlp": 0.01055066, "balance_loss_clip": 1.05553794, "balance_loss_mlp": 1.03366852, "epoch": 0.14501728543514203, "flos": 33693222493440.0, "grad_norm": 2.049708152728223, "language_loss": 0.69947547, "learning_rate": 3.862826159140214e-06, "loss": 0.72146285, "num_input_tokens_seen": 52223100, "step": 2412, "time_per_iteration": 2.792389392852783 }, { "auxiliary_loss_clip": 0.01153861, "auxiliary_loss_mlp": 0.01047504, "balance_loss_clip": 1.05600309, "balance_loss_mlp": 1.02669024, "epoch": 0.14507740868781002, "flos": 15596112648960.0, "grad_norm": 1.9741671649406984, "language_loss": 0.76655865, "learning_rate": 3.862684373853579e-06, "loss": 0.78857231, "num_input_tokens_seen": 52239690, "step": 2413, "time_per_iteration": 2.6535370349884033 }, { "auxiliary_loss_clip": 0.01072879, "auxiliary_loss_mlp": 0.01028499, "balance_loss_clip": 1.04041791, "balance_loss_mlp": 1.0252564, "epoch": 0.145137531940478, "flos": 66675343438080.0, "grad_norm": 0.9047547971056389, "language_loss": 0.58883119, "learning_rate": 3.8625425179337656e-06, "loss": 0.60984492, "num_input_tokens_seen": 52296705, "step": 2414, "time_per_iteration": 3.1230342388153076 }, { "auxiliary_loss_clip": 0.01059489, "auxiliary_loss_mlp": 0.01009718, "balance_loss_clip": 1.03874373, "balance_loss_mlp": 1.00692892, "epoch": 0.14519765519314595, "flos": 67521578929920.0, "grad_norm": 0.8422279258983576, "language_loss": 0.62171185, "learning_rate": 3.862400591386154e-06, "loss": 0.64240396, "num_input_tokens_seen": 52361830, "step": 2415, "time_per_iteration": 3.1932270526885986 }, { "auxiliary_loss_clip": 0.01151643, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.05383611, "balance_loss_mlp": 1.02500319, "epoch": 0.14525777844581392, "flos": 17198913329280.0, "grad_norm": 2.2913061581681036, "language_loss": 0.71468806, "learning_rate": 3.8622585942161245e-06, "loss": 0.73667121, "num_input_tokens_seen": 52379420, "step": 2416, "time_per_iteration": 2.5892374515533447 }, { "auxiliary_loss_clip": 0.01050816, "auxiliary_loss_mlp": 0.010049, "balance_loss_clip": 1.03675056, "balance_loss_mlp": 1.00211036, "epoch": 0.14531790169848188, "flos": 65404609015680.0, "grad_norm": 0.7147623603004897, "language_loss": 0.6037569, "learning_rate": 3.8621165264290635e-06, "loss": 0.62431407, "num_input_tokens_seen": 52446290, "step": 2417, "time_per_iteration": 3.3065359592437744 }, { "auxiliary_loss_clip": 0.01168766, "auxiliary_loss_mlp": 0.01053548, "balance_loss_clip": 1.05357766, "balance_loss_mlp": 1.03275824, "epoch": 0.14537802495114985, "flos": 32562467372160.0, "grad_norm": 3.7032433533234346, "language_loss": 0.78014368, "learning_rate": 3.861974388030356e-06, "loss": 0.80236679, "num_input_tokens_seen": 52467295, "step": 2418, "time_per_iteration": 2.887986183166504 }, { "auxiliary_loss_clip": 0.01114137, "auxiliary_loss_mlp": 0.01049779, "balance_loss_clip": 1.04354823, "balance_loss_mlp": 1.02911985, "epoch": 0.1454381482038178, "flos": 20226685432320.0, "grad_norm": 2.096300480609688, "language_loss": 0.71208847, "learning_rate": 3.861832179025394e-06, "loss": 0.73372757, "num_input_tokens_seen": 52487295, "step": 2419, "time_per_iteration": 2.764268636703491 }, { "auxiliary_loss_clip": 0.01142427, "auxiliary_loss_mlp": 0.01054976, "balance_loss_clip": 1.05351484, "balance_loss_mlp": 1.03300607, "epoch": 0.1454982714564858, "flos": 22893124671360.0, "grad_norm": 2.414673655978061, "language_loss": 0.89847761, "learning_rate": 3.861689899419569e-06, "loss": 0.92045164, "num_input_tokens_seen": 52504220, "step": 2420, "time_per_iteration": 2.7500016689300537 }, { "auxiliary_loss_clip": 0.01155004, "auxiliary_loss_mlp": 0.01060929, "balance_loss_clip": 1.05202007, "balance_loss_mlp": 1.04072309, "epoch": 0.14555839470915377, "flos": 20229845829120.0, "grad_norm": 2.0953123539002383, "language_loss": 0.82278717, "learning_rate": 3.861547549218276e-06, "loss": 0.8449465, "num_input_tokens_seen": 52521900, "step": 2421, "time_per_iteration": 2.672722816467285 }, { "auxiliary_loss_clip": 0.01099277, "auxiliary_loss_mlp": 0.01056793, "balance_loss_clip": 1.04282439, "balance_loss_mlp": 1.03507352, "epoch": 0.14561851796182174, "flos": 22236282616320.0, "grad_norm": 1.667429152986229, "language_loss": 0.81741488, "learning_rate": 3.861405128426914e-06, "loss": 0.83897555, "num_input_tokens_seen": 52540495, "step": 2422, "time_per_iteration": 2.739992141723633 }, { "auxiliary_loss_clip": 0.01031842, "auxiliary_loss_mlp": 0.00760413, "balance_loss_clip": 1.0271318, "balance_loss_mlp": 1.00019872, "epoch": 0.1456786412144897, "flos": 52636786289280.0, "grad_norm": 0.9102961670465963, "language_loss": 0.63342595, "learning_rate": 3.861262637050883e-06, "loss": 0.65134847, "num_input_tokens_seen": 52603305, "step": 2423, "time_per_iteration": 3.2704036235809326 }, { "auxiliary_loss_clip": 0.01112855, "auxiliary_loss_mlp": 0.00780065, "balance_loss_clip": 1.05457556, "balance_loss_mlp": 1.00038898, "epoch": 0.14573876446715767, "flos": 23221671396480.0, "grad_norm": 2.2239460229896206, "language_loss": 0.82163274, "learning_rate": 3.861120075095585e-06, "loss": 0.84056193, "num_input_tokens_seen": 52623435, "step": 2424, "time_per_iteration": 2.7993249893188477 }, { "auxiliary_loss_clip": 0.01141208, "auxiliary_loss_mlp": 0.01069468, "balance_loss_clip": 1.0535512, "balance_loss_mlp": 1.0496788, "epoch": 0.14579888771982563, "flos": 18114384286080.0, "grad_norm": 2.769336045727131, "language_loss": 0.78602695, "learning_rate": 3.860977442566429e-06, "loss": 0.80813372, "num_input_tokens_seen": 52642255, "step": 2425, "time_per_iteration": 2.698594093322754 }, { "auxiliary_loss_clip": 0.01156078, "auxiliary_loss_mlp": 0.01062133, "balance_loss_clip": 1.05603778, "balance_loss_mlp": 1.04148602, "epoch": 0.14585901097249362, "flos": 23001107932800.0, "grad_norm": 50.77412231982301, "language_loss": 0.83184898, "learning_rate": 3.860834739468821e-06, "loss": 0.85403109, "num_input_tokens_seen": 52658700, "step": 2426, "time_per_iteration": 2.6948676109313965 }, { "auxiliary_loss_clip": 0.01166642, "auxiliary_loss_mlp": 0.01060596, "balance_loss_clip": 1.05706, "balance_loss_mlp": 1.04040194, "epoch": 0.1459191342251616, "flos": 21908669644800.0, "grad_norm": 3.7420612082917475, "language_loss": 0.87215799, "learning_rate": 3.860691965808173e-06, "loss": 0.8944304, "num_input_tokens_seen": 52678140, "step": 2427, "time_per_iteration": 2.6479666233062744 }, { "auxiliary_loss_clip": 0.01128634, "auxiliary_loss_mlp": 0.01064346, "balance_loss_clip": 1.04835391, "balance_loss_mlp": 1.0405997, "epoch": 0.14597925747782955, "flos": 14975504438400.0, "grad_norm": 1.9221483903926033, "language_loss": 0.66815829, "learning_rate": 3.8605491215899e-06, "loss": 0.69008809, "num_input_tokens_seen": 52696825, "step": 2428, "time_per_iteration": 2.6971306800842285 }, { "auxiliary_loss_clip": 0.01155557, "auxiliary_loss_mlp": 0.01059343, "balance_loss_clip": 1.05335426, "balance_loss_mlp": 1.03842235, "epoch": 0.14603938073049752, "flos": 21068898600960.0, "grad_norm": 2.0918238083564242, "language_loss": 0.83231717, "learning_rate": 3.860406206819417e-06, "loss": 0.8544662, "num_input_tokens_seen": 52715125, "step": 2429, "time_per_iteration": 4.283279895782471 }, { "auxiliary_loss_clip": 0.01120809, "auxiliary_loss_mlp": 0.01053505, "balance_loss_clip": 1.04625869, "balance_loss_mlp": 1.03446746, "epoch": 0.14609950398316549, "flos": 19864777950720.0, "grad_norm": 2.4559042296603746, "language_loss": 0.79087842, "learning_rate": 3.860263221502145e-06, "loss": 0.81262159, "num_input_tokens_seen": 52734015, "step": 2430, "time_per_iteration": 4.197890758514404 }, { "auxiliary_loss_clip": 0.01170782, "auxiliary_loss_mlp": 0.01061965, "balance_loss_clip": 1.05820751, "balance_loss_mlp": 1.04179525, "epoch": 0.14615962723583345, "flos": 22418852469120.0, "grad_norm": 2.4376691278662506, "language_loss": 0.82910693, "learning_rate": 3.860120165643504e-06, "loss": 0.85143435, "num_input_tokens_seen": 52753025, "step": 2431, "time_per_iteration": 4.162708282470703 }, { "auxiliary_loss_clip": 0.011607, "auxiliary_loss_mlp": 0.01060112, "balance_loss_clip": 1.05553937, "balance_loss_mlp": 1.03853524, "epoch": 0.14621975048850142, "flos": 22346241125760.0, "grad_norm": 2.881661839068268, "language_loss": 0.78330141, "learning_rate": 3.859977039248921e-06, "loss": 0.80550951, "num_input_tokens_seen": 52773420, "step": 2432, "time_per_iteration": 2.6907777786254883 }, { "auxiliary_loss_clip": 0.01165399, "auxiliary_loss_mlp": 0.00782861, "balance_loss_clip": 1.05517077, "balance_loss_mlp": 1.00040507, "epoch": 0.1462798737411694, "flos": 24389163152640.0, "grad_norm": 2.3488382544651887, "language_loss": 0.79515982, "learning_rate": 3.859833842323822e-06, "loss": 0.81464243, "num_input_tokens_seen": 52792870, "step": 2433, "time_per_iteration": 2.719841241836548 }, { "auxiliary_loss_clip": 0.01124303, "auxiliary_loss_mlp": 0.01055776, "balance_loss_clip": 1.05385411, "balance_loss_mlp": 1.03484273, "epoch": 0.14633999699383737, "flos": 19244672530560.0, "grad_norm": 2.0782880949269926, "language_loss": 0.77905983, "learning_rate": 3.859690574873638e-06, "loss": 0.80086064, "num_input_tokens_seen": 52811615, "step": 2434, "time_per_iteration": 4.371506929397583 }, { "auxiliary_loss_clip": 0.01066282, "auxiliary_loss_mlp": 0.01033141, "balance_loss_clip": 1.05327988, "balance_loss_mlp": 1.03022039, "epoch": 0.14640012024650534, "flos": 62660638270080.0, "grad_norm": 0.8566726319617045, "language_loss": 0.58453119, "learning_rate": 3.8595472369038e-06, "loss": 0.60552537, "num_input_tokens_seen": 52873230, "step": 2435, "time_per_iteration": 3.229882001876831 }, { "auxiliary_loss_clip": 0.01160087, "auxiliary_loss_mlp": 0.01045043, "balance_loss_clip": 1.05263698, "balance_loss_mlp": 1.0257076, "epoch": 0.1464602434991733, "flos": 12276243146880.0, "grad_norm": 3.775553645712452, "language_loss": 0.88436592, "learning_rate": 3.859403828419744e-06, "loss": 0.90641725, "num_input_tokens_seen": 52889325, "step": 2436, "time_per_iteration": 2.568624973297119 }, { "auxiliary_loss_clip": 0.011561, "auxiliary_loss_mlp": 0.00780257, "balance_loss_clip": 1.05587268, "balance_loss_mlp": 1.00041819, "epoch": 0.14652036675184127, "flos": 20922311197440.0, "grad_norm": 2.028718201913856, "language_loss": 0.74904168, "learning_rate": 3.85926034942691e-06, "loss": 0.7684052, "num_input_tokens_seen": 52909705, "step": 2437, "time_per_iteration": 2.6361188888549805 }, { "auxiliary_loss_clip": 0.01165187, "auxiliary_loss_mlp": 0.01050068, "balance_loss_clip": 1.05295086, "balance_loss_mlp": 1.02729869, "epoch": 0.14658049000450923, "flos": 27703681528320.0, "grad_norm": 3.0822234004311033, "language_loss": 0.73914421, "learning_rate": 3.859116799930736e-06, "loss": 0.76129669, "num_input_tokens_seen": 52930300, "step": 2438, "time_per_iteration": 2.7590928077697754 }, { "auxiliary_loss_clip": 0.01154571, "auxiliary_loss_mlp": 0.01046509, "balance_loss_clip": 1.05747688, "balance_loss_mlp": 1.02708936, "epoch": 0.14664061325717723, "flos": 24936513575040.0, "grad_norm": 4.476318678757457, "language_loss": 0.74410725, "learning_rate": 3.858973179936668e-06, "loss": 0.76611805, "num_input_tokens_seen": 52949955, "step": 2439, "time_per_iteration": 2.627037763595581 }, { "auxiliary_loss_clip": 0.01152452, "auxiliary_loss_mlp": 0.01051294, "balance_loss_clip": 1.05477583, "balance_loss_mlp": 1.0309453, "epoch": 0.1467007365098452, "flos": 40297661406720.0, "grad_norm": 2.1583973700525343, "language_loss": 0.74123728, "learning_rate": 3.85882948945015e-06, "loss": 0.76327467, "num_input_tokens_seen": 52972905, "step": 2440, "time_per_iteration": 2.79715633392334 }, { "auxiliary_loss_clip": 0.01160843, "auxiliary_loss_mlp": 0.01044034, "balance_loss_clip": 1.05471611, "balance_loss_mlp": 1.02493691, "epoch": 0.14676085976251316, "flos": 26541074021760.0, "grad_norm": 1.9756103236146798, "language_loss": 0.82730794, "learning_rate": 3.85868572847663e-06, "loss": 0.84935671, "num_input_tokens_seen": 52994850, "step": 2441, "time_per_iteration": 2.6505653858184814 }, { "auxiliary_loss_clip": 0.01152605, "auxiliary_loss_mlp": 0.01049175, "balance_loss_clip": 1.05408478, "balance_loss_mlp": 1.02796757, "epoch": 0.14682098301518112, "flos": 23550110380800.0, "grad_norm": 2.582118236216862, "language_loss": 0.71455544, "learning_rate": 3.858541897021563e-06, "loss": 0.73657322, "num_input_tokens_seen": 53014740, "step": 2442, "time_per_iteration": 2.772648572921753 }, { "auxiliary_loss_clip": 0.0113053, "auxiliary_loss_mlp": 0.0104246, "balance_loss_clip": 1.05283213, "balance_loss_mlp": 1.02224207, "epoch": 0.1468811062678491, "flos": 11651073909120.0, "grad_norm": 3.6780587187273155, "language_loss": 0.81992352, "learning_rate": 3.8583979950904e-06, "loss": 0.84165335, "num_input_tokens_seen": 53029780, "step": 2443, "time_per_iteration": 2.6979780197143555 }, { "auxiliary_loss_clip": 0.01147138, "auxiliary_loss_mlp": 0.0105693, "balance_loss_clip": 1.05402422, "balance_loss_mlp": 1.03474557, "epoch": 0.14694122952051705, "flos": 23002616304000.0, "grad_norm": 3.190851099873364, "language_loss": 0.83093917, "learning_rate": 3.858254022688599e-06, "loss": 0.85297978, "num_input_tokens_seen": 53048620, "step": 2444, "time_per_iteration": 2.7177255153656006 }, { "auxiliary_loss_clip": 0.01134628, "auxiliary_loss_mlp": 0.01051986, "balance_loss_clip": 1.05385137, "balance_loss_mlp": 1.03213811, "epoch": 0.14700135277318502, "flos": 26502972670080.0, "grad_norm": 3.1425569240832414, "language_loss": 0.71183646, "learning_rate": 3.85810997982162e-06, "loss": 0.7337026, "num_input_tokens_seen": 53070055, "step": 2445, "time_per_iteration": 2.735361099243164 }, { "auxiliary_loss_clip": 0.01095177, "auxiliary_loss_mlp": 0.01023118, "balance_loss_clip": 1.05335557, "balance_loss_mlp": 1.01999438, "epoch": 0.147061476025853, "flos": 59449434387840.0, "grad_norm": 0.824990401786658, "language_loss": 0.63083708, "learning_rate": 3.857965866494923e-06, "loss": 0.65202004, "num_input_tokens_seen": 53126945, "step": 2446, "time_per_iteration": 3.0853025913238525 }, { "auxiliary_loss_clip": 0.01120664, "auxiliary_loss_mlp": 0.01045249, "balance_loss_clip": 1.05621576, "balance_loss_mlp": 1.02491164, "epoch": 0.14712159927852098, "flos": 28330897841280.0, "grad_norm": 2.813052009295296, "language_loss": 0.74895924, "learning_rate": 3.857821682713975e-06, "loss": 0.77061838, "num_input_tokens_seen": 53149130, "step": 2447, "time_per_iteration": 2.858643054962158 }, { "auxiliary_loss_clip": 0.01168929, "auxiliary_loss_mlp": 0.01042907, "balance_loss_clip": 1.0604012, "balance_loss_mlp": 1.02383327, "epoch": 0.14718172253118894, "flos": 27089825074560.0, "grad_norm": 2.2427639286159367, "language_loss": 0.8528471, "learning_rate": 3.857677428484242e-06, "loss": 0.87496543, "num_input_tokens_seen": 53167120, "step": 2448, "time_per_iteration": 2.699781894683838 }, { "auxiliary_loss_clip": 0.01092169, "auxiliary_loss_mlp": 0.01019616, "balance_loss_clip": 1.05051064, "balance_loss_mlp": 1.01654005, "epoch": 0.1472418457838569, "flos": 66706764860160.0, "grad_norm": 0.7683837313264128, "language_loss": 0.56829578, "learning_rate": 3.857533103811195e-06, "loss": 0.58941364, "num_input_tokens_seen": 53227945, "step": 2449, "time_per_iteration": 3.1478211879730225 }, { "auxiliary_loss_clip": 0.01135016, "auxiliary_loss_mlp": 0.01050801, "balance_loss_clip": 1.05464292, "balance_loss_mlp": 1.03023791, "epoch": 0.14730196903652487, "flos": 19573578391680.0, "grad_norm": 1.9048653074507311, "language_loss": 0.85067344, "learning_rate": 3.857388708700307e-06, "loss": 0.87253165, "num_input_tokens_seen": 53244615, "step": 2450, "time_per_iteration": 2.726008653640747 }, { "auxiliary_loss_clip": 0.01158708, "auxiliary_loss_mlp": 0.01049735, "balance_loss_clip": 1.05984712, "balance_loss_mlp": 1.02994645, "epoch": 0.14736209228919284, "flos": 16071031296000.0, "grad_norm": 2.306043539040143, "language_loss": 0.74523091, "learning_rate": 3.857244243157052e-06, "loss": 0.76731533, "num_input_tokens_seen": 53262205, "step": 2451, "time_per_iteration": 2.641082286834717 }, { "auxiliary_loss_clip": 0.01133915, "auxiliary_loss_mlp": 0.01038458, "balance_loss_clip": 1.05399728, "balance_loss_mlp": 1.02031422, "epoch": 0.1474222155418608, "flos": 23039460679680.0, "grad_norm": 1.8026547738986978, "language_loss": 0.82384264, "learning_rate": 3.85709970718691e-06, "loss": 0.84556639, "num_input_tokens_seen": 53282445, "step": 2452, "time_per_iteration": 2.7810096740722656 }, { "auxiliary_loss_clip": 0.01101553, "auxiliary_loss_mlp": 0.01041864, "balance_loss_clip": 1.05924153, "balance_loss_mlp": 1.0238874, "epoch": 0.1474823387945288, "flos": 17018641946880.0, "grad_norm": 1.6675065143572472, "language_loss": 0.74075705, "learning_rate": 3.856955100795361e-06, "loss": 0.76219124, "num_input_tokens_seen": 53299060, "step": 2453, "time_per_iteration": 2.7913167476654053 }, { "auxiliary_loss_clip": 0.01141798, "auxiliary_loss_mlp": 0.0104607, "balance_loss_clip": 1.05557632, "balance_loss_mlp": 1.026353, "epoch": 0.14754246204719676, "flos": 17895041884800.0, "grad_norm": 1.9958141581621542, "language_loss": 0.7558704, "learning_rate": 3.856810423987889e-06, "loss": 0.77774906, "num_input_tokens_seen": 53315970, "step": 2454, "time_per_iteration": 2.7199089527130127 }, { "auxiliary_loss_clip": 0.01147348, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.05733335, "balance_loss_mlp": 1.01864362, "epoch": 0.14760258529986472, "flos": 13079097987840.0, "grad_norm": 2.0858167958418674, "language_loss": 0.83077228, "learning_rate": 3.856665676769979e-06, "loss": 0.85262716, "num_input_tokens_seen": 53332940, "step": 2455, "time_per_iteration": 2.75616192817688 }, { "auxiliary_loss_clip": 0.01130504, "auxiliary_loss_mlp": 0.01042951, "balance_loss_clip": 1.05704689, "balance_loss_mlp": 1.02452159, "epoch": 0.1476627085525327, "flos": 30806399358720.0, "grad_norm": 2.3702229998953976, "language_loss": 0.83881497, "learning_rate": 3.85652085914712e-06, "loss": 0.86054951, "num_input_tokens_seen": 53353295, "step": 2456, "time_per_iteration": 2.7914254665374756 }, { "auxiliary_loss_clip": 0.01154014, "auxiliary_loss_mlp": 0.01043715, "balance_loss_clip": 1.05863023, "balance_loss_mlp": 1.02514231, "epoch": 0.14772283180520066, "flos": 21689434984320.0, "grad_norm": 2.4172359629848996, "language_loss": 0.84154665, "learning_rate": 3.856375971124805e-06, "loss": 0.86352402, "num_input_tokens_seen": 53373410, "step": 2457, "time_per_iteration": 2.688265323638916 }, { "auxiliary_loss_clip": 0.01155788, "auxiliary_loss_mlp": 0.01042903, "balance_loss_clip": 1.06250155, "balance_loss_mlp": 1.02529585, "epoch": 0.14778295505786862, "flos": 18770400328320.0, "grad_norm": 6.310680797376285, "language_loss": 0.75692672, "learning_rate": 3.856231012708527e-06, "loss": 0.77891362, "num_input_tokens_seen": 53391430, "step": 2458, "time_per_iteration": 2.698697805404663 }, { "auxiliary_loss_clip": 0.01117404, "auxiliary_loss_mlp": 0.01047753, "balance_loss_clip": 1.05451179, "balance_loss_mlp": 1.02718902, "epoch": 0.1478430783105366, "flos": 22893555634560.0, "grad_norm": 3.1268711361266393, "language_loss": 0.83348328, "learning_rate": 3.856085983903782e-06, "loss": 0.85513484, "num_input_tokens_seen": 53409960, "step": 2459, "time_per_iteration": 2.790552854537964 }, { "auxiliary_loss_clip": 0.01126767, "auxiliary_loss_mlp": 0.01042293, "balance_loss_clip": 1.05070424, "balance_loss_mlp": 1.02435231, "epoch": 0.14790320156320458, "flos": 15085319293440.0, "grad_norm": 3.1203941208753534, "language_loss": 0.7554391, "learning_rate": 3.855940884716071e-06, "loss": 0.77712965, "num_input_tokens_seen": 53426160, "step": 2460, "time_per_iteration": 2.815455675125122 }, { "auxiliary_loss_clip": 0.01134117, "auxiliary_loss_mlp": 0.01056838, "balance_loss_clip": 1.05845904, "balance_loss_mlp": 1.03770471, "epoch": 0.14796332481587254, "flos": 26504768350080.0, "grad_norm": 3.59241393994, "language_loss": 0.81227219, "learning_rate": 3.855795715150896e-06, "loss": 0.83418173, "num_input_tokens_seen": 53448530, "step": 2461, "time_per_iteration": 2.785569190979004 }, { "auxiliary_loss_clip": 0.01156748, "auxiliary_loss_mlp": 0.01051178, "balance_loss_clip": 1.05812359, "balance_loss_mlp": 1.03044713, "epoch": 0.1480234480685405, "flos": 17563191108480.0, "grad_norm": 3.2910626990147183, "language_loss": 0.66117477, "learning_rate": 3.855650475213761e-06, "loss": 0.683254, "num_input_tokens_seen": 53465915, "step": 2462, "time_per_iteration": 2.7222983837127686 }, { "auxiliary_loss_clip": 0.01136035, "auxiliary_loss_mlp": 0.01049537, "balance_loss_clip": 1.05622339, "balance_loss_mlp": 1.02965331, "epoch": 0.14808357132120847, "flos": 53582203232640.0, "grad_norm": 1.8120706772856114, "language_loss": 0.67226064, "learning_rate": 3.8555051649101745e-06, "loss": 0.69411635, "num_input_tokens_seen": 53496055, "step": 2463, "time_per_iteration": 3.0344398021698 }, { "auxiliary_loss_clip": 0.01153077, "auxiliary_loss_mlp": 0.01050435, "balance_loss_clip": 1.05550933, "balance_loss_mlp": 1.0307889, "epoch": 0.14814369457387644, "flos": 19829190551040.0, "grad_norm": 1.9881580745750587, "language_loss": 0.76870739, "learning_rate": 3.855359784245646e-06, "loss": 0.79074258, "num_input_tokens_seen": 53513790, "step": 2464, "time_per_iteration": 2.69480037689209 }, { "auxiliary_loss_clip": 0.01133748, "auxiliary_loss_mlp": 0.01057139, "balance_loss_clip": 1.05392432, "balance_loss_mlp": 1.03769565, "epoch": 0.1482038178265444, "flos": 23914962777600.0, "grad_norm": 1.8401367705559406, "language_loss": 0.79628456, "learning_rate": 3.855214333225688e-06, "loss": 0.81819344, "num_input_tokens_seen": 53533410, "step": 2465, "time_per_iteration": 2.6989939212799072 }, { "auxiliary_loss_clip": 0.01170385, "auxiliary_loss_mlp": 0.01054925, "balance_loss_clip": 1.06119514, "balance_loss_mlp": 1.03568494, "epoch": 0.1482639410792124, "flos": 24170503109760.0, "grad_norm": 2.005541134809237, "language_loss": 0.76272273, "learning_rate": 3.855068811855817e-06, "loss": 0.78497583, "num_input_tokens_seen": 53554775, "step": 2466, "time_per_iteration": 2.646245002746582 }, { "auxiliary_loss_clip": 0.01018939, "auxiliary_loss_mlp": 0.0114331, "balance_loss_clip": 1.03313899, "balance_loss_mlp": 1.14004362, "epoch": 0.14832406433188036, "flos": 66191051341440.0, "grad_norm": 0.8320983618395327, "language_loss": 0.6004858, "learning_rate": 3.854923220141551e-06, "loss": 0.62210834, "num_input_tokens_seen": 53609675, "step": 2467, "time_per_iteration": 3.33776593208313 }, { "auxiliary_loss_clip": 0.01141854, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.05437851, "balance_loss_mlp": 1.02509522, "epoch": 0.14838418758454833, "flos": 25411252654080.0, "grad_norm": 2.92694776694492, "language_loss": 0.87666196, "learning_rate": 3.85477755808841e-06, "loss": 0.89852077, "num_input_tokens_seen": 53626950, "step": 2468, "time_per_iteration": 4.266207456588745 }, { "auxiliary_loss_clip": 0.01130189, "auxiliary_loss_mlp": 0.01048186, "balance_loss_clip": 1.05255163, "balance_loss_mlp": 1.02782488, "epoch": 0.1484443108372163, "flos": 23289901280640.0, "grad_norm": 2.2284173124426223, "language_loss": 0.7598694, "learning_rate": 3.854631825701919e-06, "loss": 0.78165317, "num_input_tokens_seen": 53644200, "step": 2469, "time_per_iteration": 4.217481851577759 }, { "auxiliary_loss_clip": 0.01126269, "auxiliary_loss_mlp": 0.0104139, "balance_loss_clip": 1.05208421, "balance_loss_mlp": 1.02251911, "epoch": 0.14850443408988426, "flos": 14647675985280.0, "grad_norm": 6.591244267451795, "language_loss": 0.75895017, "learning_rate": 3.854486022987603e-06, "loss": 0.78062677, "num_input_tokens_seen": 53659650, "step": 2470, "time_per_iteration": 2.7157187461853027 }, { "auxiliary_loss_clip": 0.01161157, "auxiliary_loss_mlp": 0.01044729, "balance_loss_clip": 1.05831027, "balance_loss_mlp": 1.02571499, "epoch": 0.14856455734255222, "flos": 23548314700800.0, "grad_norm": 1.8610043660805562, "language_loss": 0.7215873, "learning_rate": 3.8543401499509905e-06, "loss": 0.74364614, "num_input_tokens_seen": 53680275, "step": 2471, "time_per_iteration": 4.162387132644653 }, { "auxiliary_loss_clip": 0.01135244, "auxiliary_loss_mlp": 0.01047611, "balance_loss_clip": 1.05438995, "balance_loss_mlp": 1.02717888, "epoch": 0.1486246805952202, "flos": 18077288515200.0, "grad_norm": 1.979025280241548, "language_loss": 0.89558828, "learning_rate": 3.854194206597615e-06, "loss": 0.91741687, "num_input_tokens_seen": 53698270, "step": 2472, "time_per_iteration": 2.739457607269287 }, { "auxiliary_loss_clip": 0.01134625, "auxiliary_loss_mlp": 0.01049109, "balance_loss_clip": 1.06334805, "balance_loss_mlp": 1.02964163, "epoch": 0.14868480384788818, "flos": 19353625459200.0, "grad_norm": 2.6029609251362764, "language_loss": 0.80801564, "learning_rate": 3.854048192933008e-06, "loss": 0.82985294, "num_input_tokens_seen": 53716845, "step": 2473, "time_per_iteration": 4.412883758544922 }, { "auxiliary_loss_clip": 0.01161034, "auxiliary_loss_mlp": 0.01051306, "balance_loss_clip": 1.0626657, "balance_loss_mlp": 1.03267312, "epoch": 0.14874492710055615, "flos": 22200192426240.0, "grad_norm": 3.426519274325147, "language_loss": 0.77372944, "learning_rate": 3.853902108962709e-06, "loss": 0.79585278, "num_input_tokens_seen": 53734970, "step": 2474, "time_per_iteration": 2.6879520416259766 }, { "auxiliary_loss_clip": 0.01124216, "auxiliary_loss_mlp": 0.01059785, "balance_loss_clip": 1.05597806, "balance_loss_mlp": 1.04041362, "epoch": 0.1488050503532241, "flos": 21103444506240.0, "grad_norm": 2.4771626433268734, "language_loss": 0.82151824, "learning_rate": 3.853755954692255e-06, "loss": 0.84335828, "num_input_tokens_seen": 53753415, "step": 2475, "time_per_iteration": 2.7828469276428223 }, { "auxiliary_loss_clip": 0.01115855, "auxiliary_loss_mlp": 0.01052322, "balance_loss_clip": 1.0614953, "balance_loss_mlp": 1.03341544, "epoch": 0.14886517360589208, "flos": 12786569625600.0, "grad_norm": 1.9349243252831771, "language_loss": 0.80917645, "learning_rate": 3.85360973012719e-06, "loss": 0.83085823, "num_input_tokens_seen": 53770305, "step": 2476, "time_per_iteration": 2.7227590084075928 }, { "auxiliary_loss_clip": 0.01156019, "auxiliary_loss_mlp": 0.0105036, "balance_loss_clip": 1.06338036, "balance_loss_mlp": 1.03216898, "epoch": 0.14892529685856004, "flos": 29022860419200.0, "grad_norm": 2.0032169897498346, "language_loss": 0.77659523, "learning_rate": 3.853463435273058e-06, "loss": 0.79865897, "num_input_tokens_seen": 53788895, "step": 2477, "time_per_iteration": 2.740241765975952 }, { "auxiliary_loss_clip": 0.0110234, "auxiliary_loss_mlp": 0.01092005, "balance_loss_clip": 1.07879949, "balance_loss_mlp": 1.08730817, "epoch": 0.148985420111228, "flos": 61926121054080.0, "grad_norm": 0.8188153224748298, "language_loss": 0.60153681, "learning_rate": 3.853317070135407e-06, "loss": 0.62348026, "num_input_tokens_seen": 53850260, "step": 2478, "time_per_iteration": 3.2467947006225586 }, { "auxiliary_loss_clip": 0.01107417, "auxiliary_loss_mlp": 0.01048452, "balance_loss_clip": 1.0516423, "balance_loss_mlp": 1.03041577, "epoch": 0.149045543363896, "flos": 23915106432000.0, "grad_norm": 2.666109649137694, "language_loss": 0.7139731, "learning_rate": 3.853170634719787e-06, "loss": 0.73553181, "num_input_tokens_seen": 53867520, "step": 2479, "time_per_iteration": 2.7973475456237793 }, { "auxiliary_loss_clip": 0.01140551, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.05563831, "balance_loss_mlp": 1.02407789, "epoch": 0.14910566661656396, "flos": 23654394541440.0, "grad_norm": 1.7687137634424535, "language_loss": 0.80758464, "learning_rate": 3.853024129031751e-06, "loss": 0.82942122, "num_input_tokens_seen": 53886620, "step": 2480, "time_per_iteration": 2.7238829135894775 }, { "auxiliary_loss_clip": 0.01138106, "auxiliary_loss_mlp": 0.0104537, "balance_loss_clip": 1.0584991, "balance_loss_mlp": 1.02627277, "epoch": 0.14916578986923193, "flos": 20515299212160.0, "grad_norm": 4.65741826395702, "language_loss": 0.84375542, "learning_rate": 3.852877553076854e-06, "loss": 0.86559021, "num_input_tokens_seen": 53902230, "step": 2481, "time_per_iteration": 2.791550874710083 }, { "auxiliary_loss_clip": 0.01149484, "auxiliary_loss_mlp": 0.01050268, "balance_loss_clip": 1.05772805, "balance_loss_mlp": 1.02948999, "epoch": 0.1492259131218999, "flos": 22491822948480.0, "grad_norm": 8.035113387353048, "language_loss": 0.77703977, "learning_rate": 3.8527309068606546e-06, "loss": 0.79903734, "num_input_tokens_seen": 53919475, "step": 2482, "time_per_iteration": 2.7310593128204346 }, { "auxiliary_loss_clip": 0.01133163, "auxiliary_loss_mlp": 0.01040426, "balance_loss_clip": 1.05452228, "balance_loss_mlp": 1.02032781, "epoch": 0.14928603637456786, "flos": 23185868515200.0, "grad_norm": 2.207731010812049, "language_loss": 0.78967929, "learning_rate": 3.852584190388713e-06, "loss": 0.81141514, "num_input_tokens_seen": 53939150, "step": 2483, "time_per_iteration": 2.749671220779419 }, { "auxiliary_loss_clip": 0.01154122, "auxiliary_loss_mlp": 0.00776708, "balance_loss_clip": 1.06144214, "balance_loss_mlp": 1.00029397, "epoch": 0.14934615962723582, "flos": 21653237053440.0, "grad_norm": 2.020127706544282, "language_loss": 0.70361555, "learning_rate": 3.852437403666595e-06, "loss": 0.72292387, "num_input_tokens_seen": 53958735, "step": 2484, "time_per_iteration": 2.737781524658203 }, { "auxiliary_loss_clip": 0.01141919, "auxiliary_loss_mlp": 0.00778215, "balance_loss_clip": 1.05718136, "balance_loss_mlp": 1.00030363, "epoch": 0.1494062828799038, "flos": 27010066924800.0, "grad_norm": 2.165877689982274, "language_loss": 0.84666765, "learning_rate": 3.852290546699863e-06, "loss": 0.86586899, "num_input_tokens_seen": 53975065, "step": 2485, "time_per_iteration": 2.697976589202881 }, { "auxiliary_loss_clip": 0.01145272, "auxiliary_loss_mlp": 0.0104224, "balance_loss_clip": 1.05639958, "balance_loss_mlp": 1.02257001, "epoch": 0.14946640613257178, "flos": 21214947300480.0, "grad_norm": 2.5229241908443023, "language_loss": 0.8476423, "learning_rate": 3.8521436194940894e-06, "loss": 0.86951739, "num_input_tokens_seen": 53993330, "step": 2486, "time_per_iteration": 2.6799628734588623 }, { "auxiliary_loss_clip": 0.01149031, "auxiliary_loss_mlp": 0.01039312, "balance_loss_clip": 1.05667424, "balance_loss_mlp": 1.0230875, "epoch": 0.14952652938523975, "flos": 13370872164480.0, "grad_norm": 2.1822908802725203, "language_loss": 0.74762607, "learning_rate": 3.851996622054842e-06, "loss": 0.76950949, "num_input_tokens_seen": 54010515, "step": 2487, "time_per_iteration": 2.8037290573120117 }, { "auxiliary_loss_clip": 0.01153097, "auxiliary_loss_mlp": 0.01044274, "balance_loss_clip": 1.05934322, "balance_loss_mlp": 1.02611899, "epoch": 0.1495866526379077, "flos": 35517699959040.0, "grad_norm": 16.320028017118723, "language_loss": 0.72210175, "learning_rate": 3.8518495543877e-06, "loss": 0.74407548, "num_input_tokens_seen": 54031315, "step": 2488, "time_per_iteration": 2.8031094074249268 }, { "auxiliary_loss_clip": 0.01137536, "auxiliary_loss_mlp": 0.01054916, "balance_loss_clip": 1.05569518, "balance_loss_mlp": 1.03636682, "epoch": 0.14964677589057568, "flos": 17632749795840.0, "grad_norm": 3.2458980886023143, "language_loss": 0.71352434, "learning_rate": 3.851702416498235e-06, "loss": 0.73544884, "num_input_tokens_seen": 54045965, "step": 2489, "time_per_iteration": 2.648883819580078 }, { "auxiliary_loss_clip": 0.0113767, "auxiliary_loss_mlp": 0.01052603, "balance_loss_clip": 1.05376494, "balance_loss_mlp": 1.03357768, "epoch": 0.14970689914324364, "flos": 20185280029440.0, "grad_norm": 3.893198448080141, "language_loss": 0.81559736, "learning_rate": 3.8515552083920295e-06, "loss": 0.8375001, "num_input_tokens_seen": 54059960, "step": 2490, "time_per_iteration": 2.702808380126953 }, { "auxiliary_loss_clip": 0.01125097, "auxiliary_loss_mlp": 0.01055928, "balance_loss_clip": 1.05606139, "balance_loss_mlp": 1.03803492, "epoch": 0.1497670223959116, "flos": 37228699382400.0, "grad_norm": 1.9071281232744548, "language_loss": 0.80057055, "learning_rate": 3.851407930074666e-06, "loss": 0.82238084, "num_input_tokens_seen": 54079330, "step": 2491, "time_per_iteration": 2.833272933959961 }, { "auxiliary_loss_clip": 0.01143407, "auxiliary_loss_mlp": 0.01052558, "balance_loss_clip": 1.05301452, "balance_loss_mlp": 1.03195894, "epoch": 0.1498271456485796, "flos": 24455848752000.0, "grad_norm": 2.3105790695512294, "language_loss": 0.90820229, "learning_rate": 3.851260581551727e-06, "loss": 0.93016195, "num_input_tokens_seen": 54097555, "step": 2492, "time_per_iteration": 2.684178352355957 }, { "auxiliary_loss_clip": 0.01152331, "auxiliary_loss_mlp": 0.01063543, "balance_loss_clip": 1.05835843, "balance_loss_mlp": 1.04508913, "epoch": 0.14988726890124757, "flos": 16253601148800.0, "grad_norm": 6.881290297472923, "language_loss": 0.79406559, "learning_rate": 3.851113162828802e-06, "loss": 0.81622434, "num_input_tokens_seen": 54115600, "step": 2493, "time_per_iteration": 2.6558918952941895 }, { "auxiliary_loss_clip": 0.0114858, "auxiliary_loss_mlp": 0.01052018, "balance_loss_clip": 1.05345511, "balance_loss_mlp": 1.03258693, "epoch": 0.14994739215391553, "flos": 20666555383680.0, "grad_norm": 2.3431247769189967, "language_loss": 0.79894584, "learning_rate": 3.85096567391148e-06, "loss": 0.82095182, "num_input_tokens_seen": 54135220, "step": 2494, "time_per_iteration": 2.6774168014526367 }, { "auxiliary_loss_clip": 0.01137216, "auxiliary_loss_mlp": 0.01050857, "balance_loss_clip": 1.05474579, "balance_loss_mlp": 1.03212965, "epoch": 0.1500075154065835, "flos": 70652375239680.0, "grad_norm": 1.928941284350508, "language_loss": 0.66480517, "learning_rate": 3.850818114805354e-06, "loss": 0.68668592, "num_input_tokens_seen": 54161065, "step": 2495, "time_per_iteration": 3.1090729236602783 }, { "auxiliary_loss_clip": 0.01103374, "auxiliary_loss_mlp": 0.01038654, "balance_loss_clip": 1.06896818, "balance_loss_mlp": 1.03560257, "epoch": 0.15006763865925146, "flos": 68011937447040.0, "grad_norm": 0.9030283421527312, "language_loss": 0.59524739, "learning_rate": 3.850670485516019e-06, "loss": 0.61666763, "num_input_tokens_seen": 54225095, "step": 2496, "time_per_iteration": 3.2250726222991943 }, { "auxiliary_loss_clip": 0.01163934, "auxiliary_loss_mlp": 0.01055725, "balance_loss_clip": 1.05690169, "balance_loss_mlp": 1.0360074, "epoch": 0.15012776191191943, "flos": 18916269459840.0, "grad_norm": 3.063784198565679, "language_loss": 0.65276247, "learning_rate": 3.850522786049075e-06, "loss": 0.67495906, "num_input_tokens_seen": 54243750, "step": 2497, "time_per_iteration": 2.619946002960205 }, { "auxiliary_loss_clip": 0.01125657, "auxiliary_loss_mlp": 0.01054091, "balance_loss_clip": 1.05308235, "balance_loss_mlp": 1.03316998, "epoch": 0.1501878851645874, "flos": 23701330638720.0, "grad_norm": 1.5552670947231086, "language_loss": 0.75182658, "learning_rate": 3.850375016410121e-06, "loss": 0.77362406, "num_input_tokens_seen": 54266185, "step": 2498, "time_per_iteration": 2.778163433074951 }, { "auxiliary_loss_clip": 0.01132738, "auxiliary_loss_mlp": 0.01046919, "balance_loss_clip": 1.05919099, "balance_loss_mlp": 1.02701163, "epoch": 0.15024800841725539, "flos": 20412523422720.0, "grad_norm": 3.357364003851319, "language_loss": 0.71821117, "learning_rate": 3.850227176604761e-06, "loss": 0.74000776, "num_input_tokens_seen": 54283940, "step": 2499, "time_per_iteration": 2.6929259300231934 }, { "auxiliary_loss_clip": 0.01134239, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.0547812, "balance_loss_mlp": 1.03236222, "epoch": 0.15030813166992335, "flos": 31831002812160.0, "grad_norm": 2.1406696998963652, "language_loss": 0.7206136, "learning_rate": 3.850079266638601e-06, "loss": 0.7424742, "num_input_tokens_seen": 54304830, "step": 2500, "time_per_iteration": 2.769988536834717 }, { "auxiliary_loss_clip": 0.01134021, "auxiliary_loss_mlp": 0.0105021, "balance_loss_clip": 1.06063724, "balance_loss_mlp": 1.03181624, "epoch": 0.15036825492259132, "flos": 35657822914560.0, "grad_norm": 2.0251881980439306, "language_loss": 0.65127194, "learning_rate": 3.849931286517249e-06, "loss": 0.6731143, "num_input_tokens_seen": 54325595, "step": 2501, "time_per_iteration": 2.810945510864258 }, { "auxiliary_loss_clip": 0.01137877, "auxiliary_loss_mlp": 0.01055223, "balance_loss_clip": 1.0541079, "balance_loss_mlp": 1.03511274, "epoch": 0.15042837817525928, "flos": 18838163335680.0, "grad_norm": 2.209666371186328, "language_loss": 0.83401144, "learning_rate": 3.849783236246318e-06, "loss": 0.85594243, "num_input_tokens_seen": 54342180, "step": 2502, "time_per_iteration": 2.6780545711517334 }, { "auxiliary_loss_clip": 0.01122961, "auxiliary_loss_mlp": 0.01049887, "balance_loss_clip": 1.05318308, "balance_loss_mlp": 1.0323875, "epoch": 0.15048850142792725, "flos": 19535548867200.0, "grad_norm": 2.0319272128830947, "language_loss": 0.77134645, "learning_rate": 3.849635115831421e-06, "loss": 0.79307491, "num_input_tokens_seen": 54360255, "step": 2503, "time_per_iteration": 2.7579123973846436 }, { "auxiliary_loss_clip": 0.01159116, "auxiliary_loss_mlp": 0.01044094, "balance_loss_clip": 1.05766046, "balance_loss_mlp": 1.02692807, "epoch": 0.1505486246805952, "flos": 22017550746240.0, "grad_norm": 1.9852139459946199, "language_loss": 0.85514295, "learning_rate": 3.849486925278176e-06, "loss": 0.87717503, "num_input_tokens_seen": 54378260, "step": 2504, "time_per_iteration": 2.631882905960083 }, { "auxiliary_loss_clip": 0.01146113, "auxiliary_loss_mlp": 0.01048035, "balance_loss_clip": 1.05622697, "balance_loss_mlp": 1.03098798, "epoch": 0.15060874793326318, "flos": 20743153136640.0, "grad_norm": 1.8222645508164372, "language_loss": 0.83178544, "learning_rate": 3.8493386645922e-06, "loss": 0.85372692, "num_input_tokens_seen": 54399745, "step": 2505, "time_per_iteration": 2.7706007957458496 }, { "auxiliary_loss_clip": 0.01125699, "auxiliary_loss_mlp": 0.01053819, "balance_loss_clip": 1.05586648, "balance_loss_mlp": 1.03590202, "epoch": 0.15066887118593117, "flos": 16471902055680.0, "grad_norm": 2.0148067518000445, "language_loss": 0.76044405, "learning_rate": 3.849190333779117e-06, "loss": 0.7822392, "num_input_tokens_seen": 54417105, "step": 2506, "time_per_iteration": 2.70989990234375 }, { "auxiliary_loss_clip": 0.01165314, "auxiliary_loss_mlp": 0.01041911, "balance_loss_clip": 1.05785728, "balance_loss_mlp": 1.02305174, "epoch": 0.15072899443859913, "flos": 19859319083520.0, "grad_norm": 2.823460856599666, "language_loss": 0.76220375, "learning_rate": 3.849041932844552e-06, "loss": 0.78427601, "num_input_tokens_seen": 54433920, "step": 2507, "time_per_iteration": 2.5367634296417236 }, { "auxiliary_loss_clip": 0.01144479, "auxiliary_loss_mlp": 0.01041094, "balance_loss_clip": 1.05261898, "balance_loss_mlp": 1.02306986, "epoch": 0.1507891176912671, "flos": 20776226584320.0, "grad_norm": 2.5197772895304906, "language_loss": 0.68633789, "learning_rate": 3.848893461794131e-06, "loss": 0.70819366, "num_input_tokens_seen": 54451540, "step": 2508, "time_per_iteration": 4.303388833999634 }, { "auxiliary_loss_clip": 0.01130299, "auxiliary_loss_mlp": 0.01046507, "balance_loss_clip": 1.05477214, "balance_loss_mlp": 1.02835178, "epoch": 0.15084924094393506, "flos": 23586631534080.0, "grad_norm": 2.840517748098311, "language_loss": 0.77994299, "learning_rate": 3.8487449206334845e-06, "loss": 0.80171108, "num_input_tokens_seen": 54470800, "step": 2509, "time_per_iteration": 4.380200147628784 }, { "auxiliary_loss_clip": 0.01141335, "auxiliary_loss_mlp": 0.00776843, "balance_loss_clip": 1.05463386, "balance_loss_mlp": 1.00027037, "epoch": 0.15090936419660303, "flos": 18911313383040.0, "grad_norm": 2.53406994590866, "language_loss": 0.79959804, "learning_rate": 3.848596309368246e-06, "loss": 0.81877983, "num_input_tokens_seen": 54486525, "step": 2510, "time_per_iteration": 4.219487428665161 }, { "auxiliary_loss_clip": 0.01150641, "auxiliary_loss_mlp": 0.01047345, "balance_loss_clip": 1.05529225, "balance_loss_mlp": 1.02794981, "epoch": 0.150969487449271, "flos": 17928223073280.0, "grad_norm": 1.8628702139594306, "language_loss": 0.73398602, "learning_rate": 3.8484476280040495e-06, "loss": 0.75596589, "num_input_tokens_seen": 54503795, "step": 2511, "time_per_iteration": 2.62237811088562 }, { "auxiliary_loss_clip": 0.01094269, "auxiliary_loss_mlp": 0.0104236, "balance_loss_clip": 1.04747009, "balance_loss_mlp": 1.02365553, "epoch": 0.151029610701939, "flos": 24243078539520.0, "grad_norm": 2.20399257021602, "language_loss": 0.68716824, "learning_rate": 3.848298876546534e-06, "loss": 0.70853454, "num_input_tokens_seen": 54523025, "step": 2512, "time_per_iteration": 2.823359489440918 }, { "auxiliary_loss_clip": 0.01149398, "auxiliary_loss_mlp": 0.01043296, "balance_loss_clip": 1.05574036, "balance_loss_mlp": 1.02615356, "epoch": 0.15108973395460695, "flos": 30262496641920.0, "grad_norm": 2.6278607305338877, "language_loss": 0.73833561, "learning_rate": 3.84815005500134e-06, "loss": 0.76026255, "num_input_tokens_seen": 54545025, "step": 2513, "time_per_iteration": 4.386258602142334 }, { "auxiliary_loss_clip": 0.01059691, "auxiliary_loss_mlp": 0.01109321, "balance_loss_clip": 1.0685482, "balance_loss_mlp": 1.10529137, "epoch": 0.15114985720727492, "flos": 60437624428800.0, "grad_norm": 0.9017688875456507, "language_loss": 0.64720047, "learning_rate": 3.84800116337411e-06, "loss": 0.6688906, "num_input_tokens_seen": 54604545, "step": 2514, "time_per_iteration": 3.254983425140381 }, { "auxiliary_loss_clip": 0.01146323, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.05674648, "balance_loss_mlp": 1.02584124, "epoch": 0.15120998045994288, "flos": 20521691832960.0, "grad_norm": 3.178381755435586, "language_loss": 0.72995645, "learning_rate": 3.8478522016704916e-06, "loss": 0.7518549, "num_input_tokens_seen": 54620590, "step": 2515, "time_per_iteration": 2.67921781539917 }, { "auxiliary_loss_clip": 0.01133382, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.05675673, "balance_loss_mlp": 1.02120531, "epoch": 0.15127010371261085, "flos": 21178893024000.0, "grad_norm": 2.0712989062813243, "language_loss": 0.7773214, "learning_rate": 3.8477031698961325e-06, "loss": 0.79905832, "num_input_tokens_seen": 54640410, "step": 2516, "time_per_iteration": 2.763467788696289 }, { "auxiliary_loss_clip": 0.01087601, "auxiliary_loss_mlp": 0.01004779, "balance_loss_clip": 1.05344796, "balance_loss_mlp": 1.00160813, "epoch": 0.1513302269652788, "flos": 65320648974720.0, "grad_norm": 0.7270407819118658, "language_loss": 0.54622567, "learning_rate": 3.8475540680566835e-06, "loss": 0.56714946, "num_input_tokens_seen": 54701430, "step": 2517, "time_per_iteration": 3.2293660640716553 }, { "auxiliary_loss_clip": 0.01110142, "auxiliary_loss_mlp": 0.0104362, "balance_loss_clip": 1.04499209, "balance_loss_mlp": 1.02427244, "epoch": 0.15139035021794678, "flos": 19135827342720.0, "grad_norm": 3.035771526476276, "language_loss": 0.78264821, "learning_rate": 3.8474048961577995e-06, "loss": 0.80418587, "num_input_tokens_seen": 54720845, "step": 2518, "time_per_iteration": 2.8154754638671875 }, { "auxiliary_loss_clip": 0.01147342, "auxiliary_loss_mlp": 0.01056368, "balance_loss_clip": 1.05279088, "balance_loss_mlp": 1.03681803, "epoch": 0.15145047347061477, "flos": 26578564842240.0, "grad_norm": 2.1881526177791097, "language_loss": 0.70480245, "learning_rate": 3.847255654205137e-06, "loss": 0.72683954, "num_input_tokens_seen": 54740495, "step": 2519, "time_per_iteration": 2.7098515033721924 }, { "auxiliary_loss_clip": 0.01152463, "auxiliary_loss_mlp": 0.01056975, "balance_loss_clip": 1.05683672, "balance_loss_mlp": 1.03802037, "epoch": 0.15151059672328274, "flos": 20302959962880.0, "grad_norm": 1.9048594994100874, "language_loss": 0.78681207, "learning_rate": 3.847106342204354e-06, "loss": 0.80890644, "num_input_tokens_seen": 54758415, "step": 2520, "time_per_iteration": 2.664187431335449 }, { "auxiliary_loss_clip": 0.01140573, "auxiliary_loss_mlp": 0.01071607, "balance_loss_clip": 1.05435348, "balance_loss_mlp": 1.05244994, "epoch": 0.1515707199759507, "flos": 27228367831680.0, "grad_norm": 3.950911503454746, "language_loss": 0.74849677, "learning_rate": 3.846956960161114e-06, "loss": 0.77061862, "num_input_tokens_seen": 54779355, "step": 2521, "time_per_iteration": 2.7900772094726562 }, { "auxiliary_loss_clip": 0.01132038, "auxiliary_loss_mlp": 0.01055874, "balance_loss_clip": 1.05052209, "balance_loss_mlp": 1.0360136, "epoch": 0.15163084322861867, "flos": 23587349806080.0, "grad_norm": 4.620979243079986, "language_loss": 0.8253814, "learning_rate": 3.84680750808108e-06, "loss": 0.84726053, "num_input_tokens_seen": 54799465, "step": 2522, "time_per_iteration": 2.7216525077819824 }, { "auxiliary_loss_clip": 0.01051858, "auxiliary_loss_mlp": 0.01048797, "balance_loss_clip": 1.05645704, "balance_loss_mlp": 1.04595995, "epoch": 0.15169096648128663, "flos": 66889622021760.0, "grad_norm": 0.8362305181264502, "language_loss": 0.57885599, "learning_rate": 3.846657985969922e-06, "loss": 0.59986252, "num_input_tokens_seen": 54857665, "step": 2523, "time_per_iteration": 3.2375056743621826 }, { "auxiliary_loss_clip": 0.0114147, "auxiliary_loss_mlp": 0.01057964, "balance_loss_clip": 1.05213499, "balance_loss_mlp": 1.0368042, "epoch": 0.1517510897339546, "flos": 29095435848960.0, "grad_norm": 1.8054087157705183, "language_loss": 0.74795163, "learning_rate": 3.8465083938333066e-06, "loss": 0.76994598, "num_input_tokens_seen": 54879895, "step": 2524, "time_per_iteration": 2.711557388305664 }, { "auxiliary_loss_clip": 0.01138185, "auxiliary_loss_mlp": 0.01057236, "balance_loss_clip": 1.05304718, "balance_loss_mlp": 1.03865099, "epoch": 0.1518112129866226, "flos": 18406553512320.0, "grad_norm": 1.8255227790100423, "language_loss": 0.74631184, "learning_rate": 3.8463587316769085e-06, "loss": 0.76826608, "num_input_tokens_seen": 54898245, "step": 2525, "time_per_iteration": 2.6936984062194824 }, { "auxiliary_loss_clip": 0.01144047, "auxiliary_loss_mlp": 0.01057009, "balance_loss_clip": 1.05403006, "balance_loss_mlp": 1.03747034, "epoch": 0.15187133623929056, "flos": 19425410789760.0, "grad_norm": 1.8907352833287865, "language_loss": 0.79600316, "learning_rate": 3.846208999506402e-06, "loss": 0.81801373, "num_input_tokens_seen": 54917060, "step": 2526, "time_per_iteration": 2.651494264602661 }, { "auxiliary_loss_clip": 0.01135228, "auxiliary_loss_mlp": 0.01047798, "balance_loss_clip": 1.05538774, "balance_loss_mlp": 1.03056002, "epoch": 0.15193145949195852, "flos": 17566207850880.0, "grad_norm": 1.7677336965262924, "language_loss": 0.8443349, "learning_rate": 3.846059197327466e-06, "loss": 0.86616516, "num_input_tokens_seen": 54936365, "step": 2527, "time_per_iteration": 2.702683448791504 }, { "auxiliary_loss_clip": 0.01124925, "auxiliary_loss_mlp": 0.01049207, "balance_loss_clip": 1.04976487, "balance_loss_mlp": 1.02985954, "epoch": 0.15199158274462649, "flos": 36176265866880.0, "grad_norm": 1.85678489681458, "language_loss": 0.69361663, "learning_rate": 3.845909325145779e-06, "loss": 0.7153579, "num_input_tokens_seen": 54961365, "step": 2528, "time_per_iteration": 2.9250690937042236 }, { "auxiliary_loss_clip": 0.01134092, "auxiliary_loss_mlp": 0.01055056, "balance_loss_clip": 1.05266535, "balance_loss_mlp": 1.03587484, "epoch": 0.15205170599729445, "flos": 23074042498560.0, "grad_norm": 2.004144148858156, "language_loss": 0.86482549, "learning_rate": 3.845759382967026e-06, "loss": 0.88671696, "num_input_tokens_seen": 54980750, "step": 2529, "time_per_iteration": 2.7277863025665283 }, { "auxiliary_loss_clip": 0.01124798, "auxiliary_loss_mlp": 0.01041651, "balance_loss_clip": 1.05046487, "balance_loss_mlp": 1.02297091, "epoch": 0.15211182924996242, "flos": 21908382336000.0, "grad_norm": 2.544775548600603, "language_loss": 0.83399373, "learning_rate": 3.845609370796893e-06, "loss": 0.85565823, "num_input_tokens_seen": 54999675, "step": 2530, "time_per_iteration": 2.8717291355133057 }, { "auxiliary_loss_clip": 0.01125761, "auxiliary_loss_mlp": 0.01048121, "balance_loss_clip": 1.05035281, "balance_loss_mlp": 1.02940559, "epoch": 0.15217195250263038, "flos": 13881521865600.0, "grad_norm": 2.1410437006568723, "language_loss": 0.80404246, "learning_rate": 3.845459288641066e-06, "loss": 0.82578129, "num_input_tokens_seen": 55018295, "step": 2531, "time_per_iteration": 2.8444995880126953 }, { "auxiliary_loss_clip": 0.01143114, "auxiliary_loss_mlp": 0.01043494, "balance_loss_clip": 1.05216551, "balance_loss_mlp": 1.02613723, "epoch": 0.15223207575529837, "flos": 24535319592960.0, "grad_norm": 1.7922494378130023, "language_loss": 0.78874445, "learning_rate": 3.8453091365052394e-06, "loss": 0.81061059, "num_input_tokens_seen": 55037975, "step": 2532, "time_per_iteration": 2.9122390747070312 }, { "auxiliary_loss_clip": 0.01149502, "auxiliary_loss_mlp": 0.0104596, "balance_loss_clip": 1.05737543, "balance_loss_mlp": 1.02676702, "epoch": 0.15229219900796634, "flos": 25556798563200.0, "grad_norm": 1.9533698136575197, "language_loss": 0.87679356, "learning_rate": 3.845158914395105e-06, "loss": 0.89874816, "num_input_tokens_seen": 55057135, "step": 2533, "time_per_iteration": 2.7987985610961914 }, { "auxiliary_loss_clip": 0.01117955, "auxiliary_loss_mlp": 0.01048672, "balance_loss_clip": 1.05235386, "balance_loss_mlp": 1.02983665, "epoch": 0.1523523222606343, "flos": 18217806520320.0, "grad_norm": 2.391026063452041, "language_loss": 0.78886449, "learning_rate": 3.84500862231636e-06, "loss": 0.81053078, "num_input_tokens_seen": 55075525, "step": 2534, "time_per_iteration": 2.7587406635284424 }, { "auxiliary_loss_clip": 0.01164218, "auxiliary_loss_mlp": 0.0104722, "balance_loss_clip": 1.05609345, "balance_loss_mlp": 1.0270381, "epoch": 0.15241244551330227, "flos": 13260087642240.0, "grad_norm": 2.689732363294508, "language_loss": 0.76809752, "learning_rate": 3.844858260274702e-06, "loss": 0.79021192, "num_input_tokens_seen": 55090845, "step": 2535, "time_per_iteration": 2.7494406700134277 }, { "auxiliary_loss_clip": 0.01142628, "auxiliary_loss_mlp": 0.01042905, "balance_loss_clip": 1.05345285, "balance_loss_mlp": 1.02401042, "epoch": 0.15247256876597023, "flos": 19715568854400.0, "grad_norm": 2.2235871255319446, "language_loss": 0.78301942, "learning_rate": 3.844707828275835e-06, "loss": 0.80487478, "num_input_tokens_seen": 55108750, "step": 2536, "time_per_iteration": 2.738638401031494 }, { "auxiliary_loss_clip": 0.01128919, "auxiliary_loss_mlp": 0.0105368, "balance_loss_clip": 1.05349088, "balance_loss_mlp": 1.03497589, "epoch": 0.1525326920186382, "flos": 20375858615040.0, "grad_norm": 2.311649941233105, "language_loss": 0.75824189, "learning_rate": 3.844557326325461e-06, "loss": 0.78006792, "num_input_tokens_seen": 55126750, "step": 2537, "time_per_iteration": 2.632373809814453 }, { "auxiliary_loss_clip": 0.0114911, "auxiliary_loss_mlp": 0.01041421, "balance_loss_clip": 1.05675745, "balance_loss_mlp": 1.02331281, "epoch": 0.15259281527130616, "flos": 13589963170560.0, "grad_norm": 2.193148723631548, "language_loss": 0.77737647, "learning_rate": 3.8444067544292896e-06, "loss": 0.79928178, "num_input_tokens_seen": 55144690, "step": 2538, "time_per_iteration": 2.6835639476776123 }, { "auxiliary_loss_clip": 0.01109367, "auxiliary_loss_mlp": 0.01042256, "balance_loss_clip": 1.05477905, "balance_loss_mlp": 1.02480412, "epoch": 0.15265293852397416, "flos": 22860374446080.0, "grad_norm": 2.951423477379744, "language_loss": 0.89502335, "learning_rate": 3.844256112593029e-06, "loss": 0.91653961, "num_input_tokens_seen": 55166055, "step": 2539, "time_per_iteration": 2.7825794219970703 }, { "auxiliary_loss_clip": 0.01142581, "auxiliary_loss_mlp": 0.01045856, "balance_loss_clip": 1.05367279, "balance_loss_mlp": 1.02721143, "epoch": 0.15271306177664212, "flos": 29238108670080.0, "grad_norm": 2.1073423273657044, "language_loss": 0.93423879, "learning_rate": 3.844105400822391e-06, "loss": 0.95612311, "num_input_tokens_seen": 55186285, "step": 2540, "time_per_iteration": 2.717541456222534 }, { "auxiliary_loss_clip": 0.01131603, "auxiliary_loss_mlp": 0.01041863, "balance_loss_clip": 1.05122495, "balance_loss_mlp": 1.0240885, "epoch": 0.1527731850293101, "flos": 31246269310080.0, "grad_norm": 2.084754505375857, "language_loss": 0.75217843, "learning_rate": 3.843954619123092e-06, "loss": 0.77391309, "num_input_tokens_seen": 55207915, "step": 2541, "time_per_iteration": 2.8376123905181885 }, { "auxiliary_loss_clip": 0.01116303, "auxiliary_loss_mlp": 0.01045227, "balance_loss_clip": 1.04877007, "balance_loss_mlp": 1.0268805, "epoch": 0.15283330828197805, "flos": 22382079920640.0, "grad_norm": 2.037290364787748, "language_loss": 0.80996066, "learning_rate": 3.84380376750085e-06, "loss": 0.83157599, "num_input_tokens_seen": 55227860, "step": 2542, "time_per_iteration": 2.7110376358032227 }, { "auxiliary_loss_clip": 0.01160331, "auxiliary_loss_mlp": 0.01048661, "balance_loss_clip": 1.0566076, "balance_loss_mlp": 1.02992105, "epoch": 0.15289343153464602, "flos": 25520133755520.0, "grad_norm": 3.2152362880248857, "language_loss": 0.77796149, "learning_rate": 3.843652845961383e-06, "loss": 0.80005145, "num_input_tokens_seen": 55247330, "step": 2543, "time_per_iteration": 2.674131155014038 }, { "auxiliary_loss_clip": 0.01145565, "auxiliary_loss_mlp": 0.01042133, "balance_loss_clip": 1.05380869, "balance_loss_mlp": 1.02388239, "epoch": 0.15295355478731398, "flos": 22710016114560.0, "grad_norm": 2.4890924021550918, "language_loss": 0.85898137, "learning_rate": 3.843501854510416e-06, "loss": 0.88085836, "num_input_tokens_seen": 55266195, "step": 2544, "time_per_iteration": 2.685840606689453 }, { "auxiliary_loss_clip": 0.01149904, "auxiliary_loss_mlp": 0.01051141, "balance_loss_clip": 1.05162692, "balance_loss_mlp": 1.03061318, "epoch": 0.15301367803998198, "flos": 23251907669760.0, "grad_norm": 1.9817931887295275, "language_loss": 0.83159137, "learning_rate": 3.843350793153673e-06, "loss": 0.85360181, "num_input_tokens_seen": 55283305, "step": 2545, "time_per_iteration": 2.7415812015533447 }, { "auxiliary_loss_clip": 0.01158976, "auxiliary_loss_mlp": 0.01040888, "balance_loss_clip": 1.05556524, "balance_loss_mlp": 1.02257705, "epoch": 0.15307380129264994, "flos": 25886279041920.0, "grad_norm": 6.0131413628182, "language_loss": 0.71669161, "learning_rate": 3.843199661896884e-06, "loss": 0.73869026, "num_input_tokens_seen": 55303035, "step": 2546, "time_per_iteration": 2.6626265048980713 }, { "auxiliary_loss_clip": 0.01130357, "auxiliary_loss_mlp": 0.01047635, "balance_loss_clip": 1.05013335, "balance_loss_mlp": 1.02688098, "epoch": 0.1531339245453179, "flos": 46973239205760.0, "grad_norm": 1.6563553629779504, "language_loss": 0.77438712, "learning_rate": 3.843048460745779e-06, "loss": 0.79616702, "num_input_tokens_seen": 55327570, "step": 2547, "time_per_iteration": 4.451423168182373 }, { "auxiliary_loss_clip": 0.01107553, "auxiliary_loss_mlp": 0.01044692, "balance_loss_clip": 1.04845536, "balance_loss_mlp": 1.02517736, "epoch": 0.15319404779798587, "flos": 35882049565440.0, "grad_norm": 2.3544675813743834, "language_loss": 0.74357474, "learning_rate": 3.842897189706092e-06, "loss": 0.7650972, "num_input_tokens_seen": 55351090, "step": 2548, "time_per_iteration": 2.846991539001465 }, { "auxiliary_loss_clip": 0.01138346, "auxiliary_loss_mlp": 0.0105294, "balance_loss_clip": 1.05340147, "balance_loss_mlp": 1.03304434, "epoch": 0.15325417105065384, "flos": 25664638170240.0, "grad_norm": 1.446042531021912, "language_loss": 0.80296385, "learning_rate": 3.842745848783558e-06, "loss": 0.82487667, "num_input_tokens_seen": 55371050, "step": 2549, "time_per_iteration": 5.8849101066589355 }, { "auxiliary_loss_clip": 0.01144858, "auxiliary_loss_mlp": 0.01041292, "balance_loss_clip": 1.05108786, "balance_loss_mlp": 1.02255249, "epoch": 0.1533142943033218, "flos": 18770831291520.0, "grad_norm": 1.6149920159034452, "language_loss": 0.74602014, "learning_rate": 3.842594437983917e-06, "loss": 0.76788169, "num_input_tokens_seen": 55390375, "step": 2550, "time_per_iteration": 2.684868812561035 }, { "auxiliary_loss_clip": 0.01149823, "auxiliary_loss_mlp": 0.01040743, "balance_loss_clip": 1.05212283, "balance_loss_mlp": 1.02129996, "epoch": 0.15337441755598977, "flos": 23107367341440.0, "grad_norm": 2.33086854575276, "language_loss": 0.76910275, "learning_rate": 3.8424429573129115e-06, "loss": 0.79100841, "num_input_tokens_seen": 55408890, "step": 2551, "time_per_iteration": 4.415414333343506 }, { "auxiliary_loss_clip": 0.01086721, "auxiliary_loss_mlp": 0.01054065, "balance_loss_clip": 1.05333817, "balance_loss_mlp": 1.05116868, "epoch": 0.15343454080865776, "flos": 59861079227520.0, "grad_norm": 0.9493148205555214, "language_loss": 0.5665558, "learning_rate": 3.842291406776283e-06, "loss": 0.5879637, "num_input_tokens_seen": 55463815, "step": 2552, "time_per_iteration": 3.1105730533599854 }, { "auxiliary_loss_clip": 0.011128, "auxiliary_loss_mlp": 0.01039619, "balance_loss_clip": 1.05131924, "balance_loss_mlp": 1.0204618, "epoch": 0.15349466406132573, "flos": 11910887959680.0, "grad_norm": 2.183188616823757, "language_loss": 0.88550794, "learning_rate": 3.84213978637978e-06, "loss": 0.90703207, "num_input_tokens_seen": 55481050, "step": 2553, "time_per_iteration": 2.748298406600952 }, { "auxiliary_loss_clip": 0.01147024, "auxiliary_loss_mlp": 0.01042929, "balance_loss_clip": 1.05247378, "balance_loss_mlp": 1.0232954, "epoch": 0.1535547873139937, "flos": 24096922099200.0, "grad_norm": 1.8094820084348213, "language_loss": 0.7800495, "learning_rate": 3.841988096129152e-06, "loss": 0.80194902, "num_input_tokens_seen": 55500050, "step": 2554, "time_per_iteration": 2.6555569171905518 }, { "auxiliary_loss_clip": 0.01094445, "auxiliary_loss_mlp": 0.01053684, "balance_loss_clip": 1.04876757, "balance_loss_mlp": 1.03291798, "epoch": 0.15361491056666166, "flos": 17566459246080.0, "grad_norm": 2.372022486587551, "language_loss": 0.77472258, "learning_rate": 3.841836336030151e-06, "loss": 0.79620385, "num_input_tokens_seen": 55518125, "step": 2555, "time_per_iteration": 2.7507212162017822 }, { "auxiliary_loss_clip": 0.01129555, "auxiliary_loss_mlp": 0.01046723, "balance_loss_clip": 1.05400753, "balance_loss_mlp": 1.02873409, "epoch": 0.15367503381932962, "flos": 25046041121280.0, "grad_norm": 1.5517643759455655, "language_loss": 0.77453947, "learning_rate": 3.8416845060885305e-06, "loss": 0.79630232, "num_input_tokens_seen": 55540960, "step": 2556, "time_per_iteration": 2.7947654724121094 }, { "auxiliary_loss_clip": 0.01140725, "auxiliary_loss_mlp": 0.0077646, "balance_loss_clip": 1.05336452, "balance_loss_mlp": 1.00054574, "epoch": 0.15373515707199759, "flos": 21507332008320.0, "grad_norm": 1.8786460244833383, "language_loss": 0.90098578, "learning_rate": 3.84153260631005e-06, "loss": 0.92015761, "num_input_tokens_seen": 55559210, "step": 2557, "time_per_iteration": 2.702029228210449 }, { "auxiliary_loss_clip": 0.01137441, "auxiliary_loss_mlp": 0.01048546, "balance_loss_clip": 1.05146766, "balance_loss_mlp": 1.02862656, "epoch": 0.15379528032466555, "flos": 25994729180160.0, "grad_norm": 2.4046585493240102, "language_loss": 0.7092281, "learning_rate": 3.841380636700468e-06, "loss": 0.73108798, "num_input_tokens_seen": 55578925, "step": 2558, "time_per_iteration": 2.815653085708618 }, { "auxiliary_loss_clip": 0.01131603, "auxiliary_loss_mlp": 0.01045983, "balance_loss_clip": 1.04937947, "balance_loss_mlp": 1.02659965, "epoch": 0.15385540357733354, "flos": 19277315015040.0, "grad_norm": 2.1050139676488535, "language_loss": 0.92165422, "learning_rate": 3.841228597265548e-06, "loss": 0.94343007, "num_input_tokens_seen": 55597255, "step": 2559, "time_per_iteration": 2.7363967895507812 }, { "auxiliary_loss_clip": 0.011375, "auxiliary_loss_mlp": 0.01057878, "balance_loss_clip": 1.05492043, "balance_loss_mlp": 1.03711152, "epoch": 0.1539155268300015, "flos": 28549126920960.0, "grad_norm": 2.149412909113977, "language_loss": 0.63330692, "learning_rate": 3.841076488011055e-06, "loss": 0.65526068, "num_input_tokens_seen": 55619515, "step": 2560, "time_per_iteration": 2.811800003051758 }, { "auxiliary_loss_clip": 0.01132154, "auxiliary_loss_mlp": 0.01043974, "balance_loss_clip": 1.04914606, "balance_loss_mlp": 1.02416182, "epoch": 0.15397565008266947, "flos": 23547883737600.0, "grad_norm": 2.066473237183783, "language_loss": 0.88155699, "learning_rate": 3.8409243089427574e-06, "loss": 0.90331829, "num_input_tokens_seen": 55640050, "step": 2561, "time_per_iteration": 2.7991089820861816 }, { "auxiliary_loss_clip": 0.0114054, "auxiliary_loss_mlp": 0.01041879, "balance_loss_clip": 1.05085099, "balance_loss_mlp": 1.02380693, "epoch": 0.15403577333533744, "flos": 17129821518720.0, "grad_norm": 1.906051405357337, "language_loss": 0.83117974, "learning_rate": 3.840772060066425e-06, "loss": 0.85300398, "num_input_tokens_seen": 55658695, "step": 2562, "time_per_iteration": 2.6410810947418213 }, { "auxiliary_loss_clip": 0.01128756, "auxiliary_loss_mlp": 0.00778205, "balance_loss_clip": 1.04988563, "balance_loss_mlp": 1.00058532, "epoch": 0.1540958965880054, "flos": 17894503180800.0, "grad_norm": 2.3547297997270906, "language_loss": 0.74647415, "learning_rate": 3.840619741387832e-06, "loss": 0.76554382, "num_input_tokens_seen": 55676340, "step": 2563, "time_per_iteration": 2.6813745498657227 }, { "auxiliary_loss_clip": 0.01116857, "auxiliary_loss_mlp": 0.0104411, "balance_loss_clip": 1.05126941, "balance_loss_mlp": 1.02444029, "epoch": 0.15415601984067337, "flos": 32161057908480.0, "grad_norm": 2.842824767177756, "language_loss": 0.7609179, "learning_rate": 3.8404673529127534e-06, "loss": 0.78252757, "num_input_tokens_seen": 55698890, "step": 2564, "time_per_iteration": 2.832885265350342 }, { "auxiliary_loss_clip": 0.01133461, "auxiliary_loss_mlp": 0.01052887, "balance_loss_clip": 1.05174518, "balance_loss_mlp": 1.03443313, "epoch": 0.15421614309334136, "flos": 24024418496640.0, "grad_norm": 2.0125869911748575, "language_loss": 0.70960921, "learning_rate": 3.840314894646969e-06, "loss": 0.73147273, "num_input_tokens_seen": 55718535, "step": 2565, "time_per_iteration": 2.7352514266967773 }, { "auxiliary_loss_clip": 0.01137766, "auxiliary_loss_mlp": 0.01046908, "balance_loss_clip": 1.04731965, "balance_loss_mlp": 1.02787066, "epoch": 0.15427626634600933, "flos": 24386290064640.0, "grad_norm": 2.1021891280826965, "language_loss": 0.71605748, "learning_rate": 3.840162366596259e-06, "loss": 0.73790431, "num_input_tokens_seen": 55738970, "step": 2566, "time_per_iteration": 2.681710720062256 }, { "auxiliary_loss_clip": 0.01150619, "auxiliary_loss_mlp": 0.01040725, "balance_loss_clip": 1.04834008, "balance_loss_mlp": 1.02271223, "epoch": 0.1543363895986773, "flos": 23331522165120.0, "grad_norm": 1.7167104030167524, "language_loss": 0.84746087, "learning_rate": 3.840009768766408e-06, "loss": 0.86937428, "num_input_tokens_seen": 55759585, "step": 2567, "time_per_iteration": 2.6413686275482178 }, { "auxiliary_loss_clip": 0.01104646, "auxiliary_loss_mlp": 0.01050344, "balance_loss_clip": 1.04447246, "balance_loss_mlp": 1.03164053, "epoch": 0.15439651285134526, "flos": 24274284480000.0, "grad_norm": 2.9101336164483014, "language_loss": 0.78074998, "learning_rate": 3.839857101163202e-06, "loss": 0.80229992, "num_input_tokens_seen": 55779250, "step": 2568, "time_per_iteration": 2.7385261058807373 }, { "auxiliary_loss_clip": 0.01121993, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.04715753, "balance_loss_mlp": 1.01684201, "epoch": 0.15445663610401322, "flos": 22456163721600.0, "grad_norm": 1.852436867559063, "language_loss": 0.6991998, "learning_rate": 3.83970436379243e-06, "loss": 0.72079051, "num_input_tokens_seen": 55800470, "step": 2569, "time_per_iteration": 2.746974229812622 }, { "auxiliary_loss_clip": 0.01124209, "auxiliary_loss_mlp": 0.01040299, "balance_loss_clip": 1.04695952, "balance_loss_mlp": 1.02178574, "epoch": 0.1545167593566812, "flos": 22049510872320.0, "grad_norm": 1.7212875994527412, "language_loss": 0.76482332, "learning_rate": 3.839551556659884e-06, "loss": 0.78646845, "num_input_tokens_seen": 55817795, "step": 2570, "time_per_iteration": 2.7470619678497314 }, { "auxiliary_loss_clip": 0.01137702, "auxiliary_loss_mlp": 0.01038561, "balance_loss_clip": 1.04993737, "balance_loss_mlp": 1.0192852, "epoch": 0.15457688260934915, "flos": 19318253541120.0, "grad_norm": 2.5033166184578066, "language_loss": 0.77997506, "learning_rate": 3.839398679771359e-06, "loss": 0.80173767, "num_input_tokens_seen": 55836125, "step": 2571, "time_per_iteration": 2.692863702774048 }, { "auxiliary_loss_clip": 0.0113208, "auxiliary_loss_mlp": 0.0104519, "balance_loss_clip": 1.0498451, "balance_loss_mlp": 1.02704597, "epoch": 0.15463700586201715, "flos": 24133981956480.0, "grad_norm": 4.3242380509309015, "language_loss": 0.82932413, "learning_rate": 3.839245733132652e-06, "loss": 0.85109681, "num_input_tokens_seen": 55855280, "step": 2572, "time_per_iteration": 2.8341822624206543 }, { "auxiliary_loss_clip": 0.01156188, "auxiliary_loss_mlp": 0.01042592, "balance_loss_clip": 1.05181205, "balance_loss_mlp": 1.02383995, "epoch": 0.1546971291146851, "flos": 22420935457920.0, "grad_norm": 1.5874704718869805, "language_loss": 0.90373385, "learning_rate": 3.839092716749563e-06, "loss": 0.92572165, "num_input_tokens_seen": 55875695, "step": 2573, "time_per_iteration": 2.740121364593506 }, { "auxiliary_loss_clip": 0.01088424, "auxiliary_loss_mlp": 0.01049893, "balance_loss_clip": 1.04328668, "balance_loss_mlp": 1.03003311, "epoch": 0.15475725236735308, "flos": 17530225401600.0, "grad_norm": 1.596795561637076, "language_loss": 0.70298707, "learning_rate": 3.838939630627893e-06, "loss": 0.72437024, "num_input_tokens_seen": 55894575, "step": 2574, "time_per_iteration": 2.7629144191741943 }, { "auxiliary_loss_clip": 0.01127537, "auxiliary_loss_mlp": 0.01045732, "balance_loss_clip": 1.04714394, "balance_loss_mlp": 1.02509642, "epoch": 0.15481737562002104, "flos": 22561740771840.0, "grad_norm": 6.018921028505516, "language_loss": 0.82426423, "learning_rate": 3.838786474773448e-06, "loss": 0.84599686, "num_input_tokens_seen": 55912855, "step": 2575, "time_per_iteration": 2.656783103942871 }, { "auxiliary_loss_clip": 0.01127415, "auxiliary_loss_mlp": 0.01043354, "balance_loss_clip": 1.04681587, "balance_loss_mlp": 1.02584219, "epoch": 0.154877498872689, "flos": 24900567039360.0, "grad_norm": 1.8376318938002576, "language_loss": 0.85038638, "learning_rate": 3.838633249192036e-06, "loss": 0.87209404, "num_input_tokens_seen": 55932375, "step": 2576, "time_per_iteration": 2.648484230041504 }, { "auxiliary_loss_clip": 0.01152547, "auxiliary_loss_mlp": 0.01043401, "balance_loss_clip": 1.04872847, "balance_loss_mlp": 1.02499545, "epoch": 0.15493762212535697, "flos": 28147501975680.0, "grad_norm": 1.8027999188827728, "language_loss": 0.82271254, "learning_rate": 3.838479953889465e-06, "loss": 0.84467208, "num_input_tokens_seen": 55953970, "step": 2577, "time_per_iteration": 2.6355643272399902 }, { "auxiliary_loss_clip": 0.01126009, "auxiliary_loss_mlp": 0.01049018, "balance_loss_clip": 1.05147958, "balance_loss_mlp": 1.02984881, "epoch": 0.15499774537802496, "flos": 25411073086080.0, "grad_norm": 2.1677069711314463, "language_loss": 0.76556361, "learning_rate": 3.8383265888715525e-06, "loss": 0.78731394, "num_input_tokens_seen": 55973120, "step": 2578, "time_per_iteration": 2.649043560028076 }, { "auxiliary_loss_clip": 0.01123677, "auxiliary_loss_mlp": 0.01044461, "balance_loss_clip": 1.05155993, "balance_loss_mlp": 1.0253042, "epoch": 0.15505786863069293, "flos": 22091562720000.0, "grad_norm": 1.9614380224881987, "language_loss": 0.82443559, "learning_rate": 3.83817315414411e-06, "loss": 0.8461169, "num_input_tokens_seen": 55993260, "step": 2579, "time_per_iteration": 2.62631893157959 }, { "auxiliary_loss_clip": 0.01143904, "auxiliary_loss_mlp": 0.01044324, "balance_loss_clip": 1.05856657, "balance_loss_mlp": 1.02556014, "epoch": 0.1551179918833609, "flos": 18917131386240.0, "grad_norm": 2.610374735790095, "language_loss": 0.80465376, "learning_rate": 3.838019649712958e-06, "loss": 0.82653606, "num_input_tokens_seen": 56012130, "step": 2580, "time_per_iteration": 2.6512253284454346 }, { "auxiliary_loss_clip": 0.0107737, "auxiliary_loss_mlp": 0.01006304, "balance_loss_clip": 1.04551053, "balance_loss_mlp": 1.00360954, "epoch": 0.15517811513602886, "flos": 66239172587520.0, "grad_norm": 0.842131683094019, "language_loss": 0.58823448, "learning_rate": 3.8378660755839166e-06, "loss": 0.60907125, "num_input_tokens_seen": 56079045, "step": 2581, "time_per_iteration": 3.357855796813965 }, { "auxiliary_loss_clip": 0.01108206, "auxiliary_loss_mlp": 0.01047031, "balance_loss_clip": 1.04392648, "balance_loss_mlp": 1.0249418, "epoch": 0.15523823838869683, "flos": 24021078531840.0, "grad_norm": 1.9584677228939371, "language_loss": 0.84773678, "learning_rate": 3.8377124317628095e-06, "loss": 0.86928916, "num_input_tokens_seen": 56098745, "step": 2582, "time_per_iteration": 2.727062702178955 }, { "auxiliary_loss_clip": 0.01144131, "auxiliary_loss_mlp": 0.01051911, "balance_loss_clip": 1.05233002, "balance_loss_mlp": 1.03175235, "epoch": 0.1552983616413648, "flos": 20485062938880.0, "grad_norm": 2.466663791870015, "language_loss": 0.79050052, "learning_rate": 3.8375587182554625e-06, "loss": 0.81246096, "num_input_tokens_seen": 56117655, "step": 2583, "time_per_iteration": 2.664794683456421 }, { "auxiliary_loss_clip": 0.01139818, "auxiliary_loss_mlp": 0.01054771, "balance_loss_clip": 1.04957032, "balance_loss_mlp": 1.03252697, "epoch": 0.15535848489403276, "flos": 32123710742400.0, "grad_norm": 1.8743170599575527, "language_loss": 0.76320136, "learning_rate": 3.837404935067705e-06, "loss": 0.78514719, "num_input_tokens_seen": 56141960, "step": 2584, "time_per_iteration": 2.757392168045044 }, { "auxiliary_loss_clip": 0.01137324, "auxiliary_loss_mlp": 0.01042496, "balance_loss_clip": 1.04884958, "balance_loss_mlp": 1.02302885, "epoch": 0.15541860814670075, "flos": 19098444263040.0, "grad_norm": 1.6493041410587026, "language_loss": 0.75269651, "learning_rate": 3.837251082205368e-06, "loss": 0.77449471, "num_input_tokens_seen": 56161430, "step": 2585, "time_per_iteration": 2.6497461795806885 }, { "auxiliary_loss_clip": 0.01116144, "auxiliary_loss_mlp": 0.01042356, "balance_loss_clip": 1.04862189, "balance_loss_mlp": 1.02321053, "epoch": 0.1554787313993687, "flos": 19172097100800.0, "grad_norm": 2.068989677221064, "language_loss": 0.61187196, "learning_rate": 3.837097159674286e-06, "loss": 0.63345695, "num_input_tokens_seen": 56179390, "step": 2586, "time_per_iteration": 2.697852373123169 }, { "auxiliary_loss_clip": 0.01129408, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.04842281, "balance_loss_mlp": 1.02341127, "epoch": 0.15553885465203668, "flos": 16143822207360.0, "grad_norm": 1.8484108176722505, "language_loss": 0.81318939, "learning_rate": 3.836943167480296e-06, "loss": 0.83490539, "num_input_tokens_seen": 56198020, "step": 2587, "time_per_iteration": 4.212551593780518 }, { "auxiliary_loss_clip": 0.01160891, "auxiliary_loss_mlp": 0.01054822, "balance_loss_clip": 1.05309868, "balance_loss_mlp": 1.03325701, "epoch": 0.15559897790470464, "flos": 25337779384320.0, "grad_norm": 1.866779523391448, "language_loss": 0.88716942, "learning_rate": 3.836789105629236e-06, "loss": 0.90932655, "num_input_tokens_seen": 56218165, "step": 2588, "time_per_iteration": 4.192267894744873 }, { "auxiliary_loss_clip": 0.01094981, "auxiliary_loss_mlp": 0.01052123, "balance_loss_clip": 1.04558384, "balance_loss_mlp": 1.03164268, "epoch": 0.1556591011573726, "flos": 23148772744320.0, "grad_norm": 2.018423224363699, "language_loss": 0.64624381, "learning_rate": 3.83663497412695e-06, "loss": 0.66771483, "num_input_tokens_seen": 56237160, "step": 2589, "time_per_iteration": 4.303871154785156 }, { "auxiliary_loss_clip": 0.01104407, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.04520249, "balance_loss_mlp": 1.02123344, "epoch": 0.15571922441004057, "flos": 25370888745600.0, "grad_norm": 1.784618480549341, "language_loss": 0.82832813, "learning_rate": 3.836480772979281e-06, "loss": 0.84979194, "num_input_tokens_seen": 56257610, "step": 2590, "time_per_iteration": 4.460350751876831 }, { "auxiliary_loss_clip": 0.011248, "auxiliary_loss_mlp": 0.01047287, "balance_loss_clip": 1.05032134, "balance_loss_mlp": 1.02694952, "epoch": 0.15577934766270854, "flos": 14501375890560.0, "grad_norm": 2.6687659077907484, "language_loss": 0.78766, "learning_rate": 3.836326502192077e-06, "loss": 0.80938083, "num_input_tokens_seen": 56275215, "step": 2591, "time_per_iteration": 2.73305606842041 }, { "auxiliary_loss_clip": 0.01143879, "auxiliary_loss_mlp": 0.01049015, "balance_loss_clip": 1.05174232, "balance_loss_mlp": 1.03137255, "epoch": 0.15583947091537653, "flos": 37414537372800.0, "grad_norm": 2.0331558547393054, "language_loss": 0.65025747, "learning_rate": 3.836172161771189e-06, "loss": 0.67218637, "num_input_tokens_seen": 56297130, "step": 2592, "time_per_iteration": 2.8582632541656494 }, { "auxiliary_loss_clip": 0.01136043, "auxiliary_loss_mlp": 0.01052096, "balance_loss_clip": 1.05417228, "balance_loss_mlp": 1.0322001, "epoch": 0.1558995941680445, "flos": 21834729498240.0, "grad_norm": 2.311634250072179, "language_loss": 0.82506329, "learning_rate": 3.836017751722467e-06, "loss": 0.84694475, "num_input_tokens_seen": 56314995, "step": 2593, "time_per_iteration": 2.7230453491210938 }, { "auxiliary_loss_clip": 0.01142565, "auxiliary_loss_mlp": 0.01046037, "balance_loss_clip": 1.05237365, "balance_loss_mlp": 1.02676034, "epoch": 0.15595971742071246, "flos": 19792633484160.0, "grad_norm": 2.778410683125911, "language_loss": 0.73220694, "learning_rate": 3.8358632720517695e-06, "loss": 0.75409293, "num_input_tokens_seen": 56334005, "step": 2594, "time_per_iteration": 2.708063840866089 }, { "auxiliary_loss_clip": 0.01117989, "auxiliary_loss_mlp": 0.01040106, "balance_loss_clip": 1.0453043, "balance_loss_mlp": 1.02077007, "epoch": 0.15601984067338043, "flos": 26722135503360.0, "grad_norm": 2.1444704922101105, "language_loss": 0.81569934, "learning_rate": 3.835708722764952e-06, "loss": 0.83728027, "num_input_tokens_seen": 56353795, "step": 2595, "time_per_iteration": 2.716334581375122 }, { "auxiliary_loss_clip": 0.01155359, "auxiliary_loss_mlp": 0.01043269, "balance_loss_clip": 1.05093551, "balance_loss_mlp": 1.0238502, "epoch": 0.1560799639260484, "flos": 18369278173440.0, "grad_norm": 1.8943501893042642, "language_loss": 0.86674929, "learning_rate": 3.835554103867876e-06, "loss": 0.88873553, "num_input_tokens_seen": 56373195, "step": 2596, "time_per_iteration": 2.5947446823120117 }, { "auxiliary_loss_clip": 0.01144729, "auxiliary_loss_mlp": 0.01042109, "balance_loss_clip": 1.05225515, "balance_loss_mlp": 1.02360725, "epoch": 0.15614008717871636, "flos": 22598980197120.0, "grad_norm": 1.8059460934517404, "language_loss": 0.68772388, "learning_rate": 3.835399415366404e-06, "loss": 0.70959222, "num_input_tokens_seen": 56391525, "step": 2597, "time_per_iteration": 2.8101041316986084 }, { "auxiliary_loss_clip": 0.01130069, "auxiliary_loss_mlp": 0.01050835, "balance_loss_clip": 1.05409336, "balance_loss_mlp": 1.03165436, "epoch": 0.15620021043138435, "flos": 22746860490240.0, "grad_norm": 1.9103744906429732, "language_loss": 0.79860938, "learning_rate": 3.8352446572664035e-06, "loss": 0.82041842, "num_input_tokens_seen": 56410715, "step": 2598, "time_per_iteration": 2.695117950439453 }, { "auxiliary_loss_clip": 0.0112861, "auxiliary_loss_mlp": 0.00776118, "balance_loss_clip": 1.04750216, "balance_loss_mlp": 1.0006249, "epoch": 0.15626033368405232, "flos": 13114936782720.0, "grad_norm": 3.1104681024188827, "language_loss": 0.83092594, "learning_rate": 3.8350898295737405e-06, "loss": 0.84997326, "num_input_tokens_seen": 56429170, "step": 2599, "time_per_iteration": 2.665703773498535 }, { "auxiliary_loss_clip": 0.01160593, "auxiliary_loss_mlp": 0.0105002, "balance_loss_clip": 1.05274248, "balance_loss_mlp": 1.02924192, "epoch": 0.15632045693672028, "flos": 16472297105280.0, "grad_norm": 2.2910683048406266, "language_loss": 0.81530893, "learning_rate": 3.834934932294287e-06, "loss": 0.83741504, "num_input_tokens_seen": 56445685, "step": 2600, "time_per_iteration": 2.615651845932007 }, { "auxiliary_loss_clip": 0.01161023, "auxiliary_loss_mlp": 0.00776671, "balance_loss_clip": 1.05562234, "balance_loss_mlp": 1.00063944, "epoch": 0.15638058018938825, "flos": 20850346298880.0, "grad_norm": 1.7832591469657297, "language_loss": 0.88511437, "learning_rate": 3.834779965433917e-06, "loss": 0.90449131, "num_input_tokens_seen": 56465900, "step": 2601, "time_per_iteration": 2.6833529472351074 }, { "auxiliary_loss_clip": 0.0116257, "auxiliary_loss_mlp": 0.0106307, "balance_loss_clip": 1.05569744, "balance_loss_mlp": 1.04120743, "epoch": 0.1564407034420562, "flos": 21872220318720.0, "grad_norm": 1.9421054688538308, "language_loss": 0.78707534, "learning_rate": 3.834624928998508e-06, "loss": 0.80933177, "num_input_tokens_seen": 56485020, "step": 2602, "time_per_iteration": 2.6296608448028564 }, { "auxiliary_loss_clip": 0.01126653, "auxiliary_loss_mlp": 0.01043676, "balance_loss_clip": 1.05035329, "balance_loss_mlp": 1.02419758, "epoch": 0.15650082669472418, "flos": 21834549930240.0, "grad_norm": 1.8230718276715763, "language_loss": 0.74029547, "learning_rate": 3.8344698229939376e-06, "loss": 0.76199877, "num_input_tokens_seen": 56505205, "step": 2603, "time_per_iteration": 2.744508743286133 }, { "auxiliary_loss_clip": 0.01143305, "auxiliary_loss_mlp": 0.01051047, "balance_loss_clip": 1.04820418, "balance_loss_mlp": 1.03112721, "epoch": 0.15656094994739214, "flos": 13800542653440.0, "grad_norm": 4.041164356714064, "language_loss": 0.87723601, "learning_rate": 3.8343146474260865e-06, "loss": 0.89917958, "num_input_tokens_seen": 56521495, "step": 2604, "time_per_iteration": 2.682457447052002 }, { "auxiliary_loss_clip": 0.01145351, "auxiliary_loss_mlp": 0.01044759, "balance_loss_clip": 1.04976749, "balance_loss_mlp": 1.0256021, "epoch": 0.15662107320006013, "flos": 27308197808640.0, "grad_norm": 2.260429022209425, "language_loss": 0.8573193, "learning_rate": 3.834159402300841e-06, "loss": 0.87922043, "num_input_tokens_seen": 56540665, "step": 2605, "time_per_iteration": 2.7724974155426025 }, { "auxiliary_loss_clip": 0.0115108, "auxiliary_loss_mlp": 0.01047256, "balance_loss_clip": 1.05181313, "balance_loss_mlp": 1.02676356, "epoch": 0.1566811964527281, "flos": 26685075646080.0, "grad_norm": 1.7309636492693905, "language_loss": 0.73101914, "learning_rate": 3.834004087624087e-06, "loss": 0.75300246, "num_input_tokens_seen": 56560805, "step": 2606, "time_per_iteration": 2.7490081787109375 }, { "auxiliary_loss_clip": 0.01158388, "auxiliary_loss_mlp": 0.01049752, "balance_loss_clip": 1.0552665, "balance_loss_mlp": 1.03165627, "epoch": 0.15674131970539606, "flos": 16103422385280.0, "grad_norm": 2.968092109370304, "language_loss": 0.76497948, "learning_rate": 3.8338487034017145e-06, "loss": 0.78706092, "num_input_tokens_seen": 56576335, "step": 2607, "time_per_iteration": 2.6597230434417725 }, { "auxiliary_loss_clip": 0.01120645, "auxiliary_loss_mlp": 0.01047174, "balance_loss_clip": 1.05131412, "balance_loss_mlp": 1.0284934, "epoch": 0.15680144295806403, "flos": 19169690889600.0, "grad_norm": 1.7981763092074996, "language_loss": 0.82107675, "learning_rate": 3.833693249639615e-06, "loss": 0.84275496, "num_input_tokens_seen": 56595880, "step": 2608, "time_per_iteration": 2.7072103023529053 }, { "auxiliary_loss_clip": 0.0112834, "auxiliary_loss_mlp": 0.01045106, "balance_loss_clip": 1.04685056, "balance_loss_mlp": 1.02436399, "epoch": 0.156861566210732, "flos": 20813430096000.0, "grad_norm": 1.6817301031159713, "language_loss": 0.72335941, "learning_rate": 3.833537726343684e-06, "loss": 0.74509382, "num_input_tokens_seen": 56615130, "step": 2609, "time_per_iteration": 2.690690755844116 }, { "auxiliary_loss_clip": 0.01143972, "auxiliary_loss_mlp": 0.01036718, "balance_loss_clip": 1.04901087, "balance_loss_mlp": 1.01756072, "epoch": 0.15692168946339996, "flos": 20047922421120.0, "grad_norm": 5.132438477880424, "language_loss": 0.72317064, "learning_rate": 3.833382133519818e-06, "loss": 0.74497753, "num_input_tokens_seen": 56634005, "step": 2610, "time_per_iteration": 2.6515614986419678 }, { "auxiliary_loss_clip": 0.01159588, "auxiliary_loss_mlp": 0.01051513, "balance_loss_clip": 1.05216432, "balance_loss_mlp": 1.03063977, "epoch": 0.15698181271606793, "flos": 21398019943680.0, "grad_norm": 2.0600295188113935, "language_loss": 0.72915608, "learning_rate": 3.833226471173919e-06, "loss": 0.75126708, "num_input_tokens_seen": 56653480, "step": 2611, "time_per_iteration": 2.630988359451294 }, { "auxiliary_loss_clip": 0.01141924, "auxiliary_loss_mlp": 0.01042538, "balance_loss_clip": 1.04917872, "balance_loss_mlp": 1.0231905, "epoch": 0.15704193596873592, "flos": 20845785271680.0, "grad_norm": 2.0339762399532186, "language_loss": 0.70766544, "learning_rate": 3.833070739311887e-06, "loss": 0.72951007, "num_input_tokens_seen": 56672270, "step": 2612, "time_per_iteration": 2.6569461822509766 }, { "auxiliary_loss_clip": 0.01116284, "auxiliary_loss_mlp": 0.01051299, "balance_loss_clip": 1.04844582, "balance_loss_mlp": 1.03221321, "epoch": 0.15710205922140388, "flos": 21762908254080.0, "grad_norm": 1.9704781930994688, "language_loss": 0.76294881, "learning_rate": 3.83291493793963e-06, "loss": 0.78462464, "num_input_tokens_seen": 56691510, "step": 2613, "time_per_iteration": 2.7188539505004883 }, { "auxiliary_loss_clip": 0.01115155, "auxiliary_loss_mlp": 0.01049301, "balance_loss_clip": 1.04504919, "balance_loss_mlp": 1.02956033, "epoch": 0.15716218247407185, "flos": 25007760201600.0, "grad_norm": 2.137998057111896, "language_loss": 0.65944499, "learning_rate": 3.832759067063055e-06, "loss": 0.68108952, "num_input_tokens_seen": 56712230, "step": 2614, "time_per_iteration": 2.7550084590911865 }, { "auxiliary_loss_clip": 0.01151987, "auxiliary_loss_mlp": 0.01044173, "balance_loss_clip": 1.05387104, "balance_loss_mlp": 1.02374101, "epoch": 0.1572223057267398, "flos": 20191780391040.0, "grad_norm": 2.2755662506820915, "language_loss": 0.75204211, "learning_rate": 3.832603126688072e-06, "loss": 0.77400374, "num_input_tokens_seen": 56727490, "step": 2615, "time_per_iteration": 2.683225154876709 }, { "auxiliary_loss_clip": 0.01138545, "auxiliary_loss_mlp": 0.01050891, "balance_loss_clip": 1.05209839, "balance_loss_mlp": 1.03078008, "epoch": 0.15728242897940778, "flos": 20959514709120.0, "grad_norm": 2.581872009488739, "language_loss": 0.73064095, "learning_rate": 3.832447116820594e-06, "loss": 0.75253528, "num_input_tokens_seen": 56747385, "step": 2616, "time_per_iteration": 2.6660919189453125 }, { "auxiliary_loss_clip": 0.01130717, "auxiliary_loss_mlp": 0.01047511, "balance_loss_clip": 1.04999971, "balance_loss_mlp": 1.02794933, "epoch": 0.15734255223207574, "flos": 23038275530880.0, "grad_norm": 2.813587490853999, "language_loss": 0.72425079, "learning_rate": 3.832291037466539e-06, "loss": 0.74603307, "num_input_tokens_seen": 56768055, "step": 2617, "time_per_iteration": 2.768561363220215 }, { "auxiliary_loss_clip": 0.01138315, "auxiliary_loss_mlp": 0.0104637, "balance_loss_clip": 1.04947805, "balance_loss_mlp": 1.02548432, "epoch": 0.15740267548474374, "flos": 20551281661440.0, "grad_norm": 2.3222819484870016, "language_loss": 0.74358094, "learning_rate": 3.8321348886318235e-06, "loss": 0.76542777, "num_input_tokens_seen": 56785110, "step": 2618, "time_per_iteration": 2.66121768951416 }, { "auxiliary_loss_clip": 0.01162954, "auxiliary_loss_mlp": 0.01046178, "balance_loss_clip": 1.05417252, "balance_loss_mlp": 1.02526867, "epoch": 0.1574627987374117, "flos": 22666922772480.0, "grad_norm": 1.8808629075569874, "language_loss": 0.78896272, "learning_rate": 3.8319786703223695e-06, "loss": 0.81105405, "num_input_tokens_seen": 56804975, "step": 2619, "time_per_iteration": 2.6743338108062744 }, { "auxiliary_loss_clip": 0.01126081, "auxiliary_loss_mlp": 0.01055551, "balance_loss_clip": 1.05046356, "balance_loss_mlp": 1.03576207, "epoch": 0.15752292199007967, "flos": 16800664262400.0, "grad_norm": 1.9082963728737496, "language_loss": 0.76517296, "learning_rate": 3.831822382544101e-06, "loss": 0.78698927, "num_input_tokens_seen": 56822470, "step": 2620, "time_per_iteration": 2.6481080055236816 }, { "auxiliary_loss_clip": 0.01136128, "auxiliary_loss_mlp": 0.0104575, "balance_loss_clip": 1.05097985, "balance_loss_mlp": 1.02488887, "epoch": 0.15758304524274763, "flos": 29826002568960.0, "grad_norm": 1.6603432400664486, "language_loss": 0.7136035, "learning_rate": 3.831666025302944e-06, "loss": 0.73542225, "num_input_tokens_seen": 56842100, "step": 2621, "time_per_iteration": 2.70985746383667 }, { "auxiliary_loss_clip": 0.01103274, "auxiliary_loss_mlp": 0.01052522, "balance_loss_clip": 1.04624665, "balance_loss_mlp": 1.02921629, "epoch": 0.1576431684954156, "flos": 53577426723840.0, "grad_norm": 2.1843515622778624, "language_loss": 0.72136736, "learning_rate": 3.831509598604828e-06, "loss": 0.74292529, "num_input_tokens_seen": 56865920, "step": 2622, "time_per_iteration": 3.024561643600464 }, { "auxiliary_loss_clip": 0.01095163, "auxiliary_loss_mlp": 0.01043948, "balance_loss_clip": 1.04474711, "balance_loss_mlp": 1.02464843, "epoch": 0.15770329174808356, "flos": 20813609664000.0, "grad_norm": 1.6586715789846178, "language_loss": 0.87637675, "learning_rate": 3.831353102455684e-06, "loss": 0.8977679, "num_input_tokens_seen": 56885265, "step": 2623, "time_per_iteration": 2.9600114822387695 }, { "auxiliary_loss_clip": 0.01158714, "auxiliary_loss_mlp": 0.01044337, "balance_loss_clip": 1.05476475, "balance_loss_mlp": 1.02564478, "epoch": 0.15776341500075153, "flos": 24974004395520.0, "grad_norm": 1.6915331173398198, "language_loss": 0.81600082, "learning_rate": 3.831196536861448e-06, "loss": 0.83803129, "num_input_tokens_seen": 56906710, "step": 2624, "time_per_iteration": 2.6621103286743164 }, { "auxiliary_loss_clip": 0.01122344, "auxiliary_loss_mlp": 0.01049423, "balance_loss_clip": 1.04776418, "balance_loss_mlp": 1.02990842, "epoch": 0.15782353825341952, "flos": 21907915459200.0, "grad_norm": 2.879465237309773, "language_loss": 0.79977828, "learning_rate": 3.831039901828054e-06, "loss": 0.82149595, "num_input_tokens_seen": 56924275, "step": 2625, "time_per_iteration": 2.7291064262390137 }, { "auxiliary_loss_clip": 0.01157938, "auxiliary_loss_mlp": 0.01046203, "balance_loss_clip": 1.05403268, "balance_loss_mlp": 1.02857196, "epoch": 0.15788366150608749, "flos": 26177191292160.0, "grad_norm": 2.133783972400447, "language_loss": 0.80332482, "learning_rate": 3.830883197361445e-06, "loss": 0.8253662, "num_input_tokens_seen": 56941525, "step": 2626, "time_per_iteration": 4.252760171890259 }, { "auxiliary_loss_clip": 0.01102762, "auxiliary_loss_mlp": 0.01057658, "balance_loss_clip": 1.05214024, "balance_loss_mlp": 1.03512752, "epoch": 0.15794378475875545, "flos": 27709822753920.0, "grad_norm": 3.9802810067864045, "language_loss": 0.73636395, "learning_rate": 3.830726423467561e-06, "loss": 0.75796819, "num_input_tokens_seen": 56962145, "step": 2627, "time_per_iteration": 4.328871250152588 }, { "auxiliary_loss_clip": 0.01117433, "auxiliary_loss_mlp": 0.01055032, "balance_loss_clip": 1.0503006, "balance_loss_mlp": 1.0351001, "epoch": 0.15800390801142342, "flos": 12130158533760.0, "grad_norm": 2.0211273696228216, "language_loss": 0.84589541, "learning_rate": 3.830569580152348e-06, "loss": 0.86762005, "num_input_tokens_seen": 56977505, "step": 2628, "time_per_iteration": 2.6785013675689697 }, { "auxiliary_loss_clip": 0.01129476, "auxiliary_loss_mlp": 0.01040858, "balance_loss_clip": 1.05065978, "balance_loss_mlp": 1.02308416, "epoch": 0.15806403126409138, "flos": 20704728562560.0, "grad_norm": 1.897214582222077, "language_loss": 0.76437485, "learning_rate": 3.830412667421752e-06, "loss": 0.78607821, "num_input_tokens_seen": 56996770, "step": 2629, "time_per_iteration": 4.2878499031066895 }, { "auxiliary_loss_clip": 0.01143973, "auxiliary_loss_mlp": 0.01046449, "balance_loss_clip": 1.0529623, "balance_loss_mlp": 1.02675569, "epoch": 0.15812415451675935, "flos": 17821712269440.0, "grad_norm": 2.252423233454998, "language_loss": 0.73337436, "learning_rate": 3.8302556852817245e-06, "loss": 0.75527859, "num_input_tokens_seen": 57014970, "step": 2630, "time_per_iteration": 4.253108263015747 }, { "auxiliary_loss_clip": 0.01156261, "auxiliary_loss_mlp": 0.01045602, "balance_loss_clip": 1.05644512, "balance_loss_mlp": 1.02615929, "epoch": 0.15818427776942734, "flos": 20084048524800.0, "grad_norm": 2.390369083551665, "language_loss": 0.83678091, "learning_rate": 3.8300986337382184e-06, "loss": 0.85879952, "num_input_tokens_seen": 57034045, "step": 2631, "time_per_iteration": 2.6145882606506348 }, { "auxiliary_loss_clip": 0.01159092, "auxiliary_loss_mlp": 0.01045772, "balance_loss_clip": 1.05313432, "balance_loss_mlp": 1.02746117, "epoch": 0.1582444010220953, "flos": 21214911386880.0, "grad_norm": 1.8755653224160422, "language_loss": 0.78415525, "learning_rate": 3.8299415127971895e-06, "loss": 0.80620384, "num_input_tokens_seen": 57053695, "step": 2632, "time_per_iteration": 2.656691551208496 }, { "auxiliary_loss_clip": 0.01151481, "auxiliary_loss_mlp": 0.01057283, "balance_loss_clip": 1.05574381, "balance_loss_mlp": 1.03769732, "epoch": 0.15830452427476327, "flos": 17858341163520.0, "grad_norm": 2.079450153413421, "language_loss": 0.8301838, "learning_rate": 3.829784322464594e-06, "loss": 0.85227144, "num_input_tokens_seen": 57071290, "step": 2633, "time_per_iteration": 2.622725248336792 }, { "auxiliary_loss_clip": 0.01165069, "auxiliary_loss_mlp": 0.01041545, "balance_loss_clip": 1.05761647, "balance_loss_mlp": 1.02223265, "epoch": 0.15836464752743123, "flos": 24534960456960.0, "grad_norm": 2.1719104392782813, "language_loss": 0.77448404, "learning_rate": 3.829627062746394e-06, "loss": 0.79655015, "num_input_tokens_seen": 57091465, "step": 2634, "time_per_iteration": 2.6383235454559326 }, { "auxiliary_loss_clip": 0.01127407, "auxiliary_loss_mlp": 0.00777775, "balance_loss_clip": 1.05277348, "balance_loss_mlp": 1.00136137, "epoch": 0.1584247707800992, "flos": 20120821073280.0, "grad_norm": 3.5133527254089087, "language_loss": 0.88479185, "learning_rate": 3.829469733648552e-06, "loss": 0.90384364, "num_input_tokens_seen": 57110075, "step": 2635, "time_per_iteration": 2.725924491882324 }, { "auxiliary_loss_clip": 0.01096223, "auxiliary_loss_mlp": 0.01058885, "balance_loss_clip": 1.04816198, "balance_loss_mlp": 1.03847599, "epoch": 0.15848489403276717, "flos": 20375966355840.0, "grad_norm": 2.8627721083207627, "language_loss": 0.75762677, "learning_rate": 3.829312335177034e-06, "loss": 0.77917778, "num_input_tokens_seen": 57128945, "step": 2636, "time_per_iteration": 2.775310516357422 }, { "auxiliary_loss_clip": 0.01120174, "auxiliary_loss_mlp": 0.01043834, "balance_loss_clip": 1.05117822, "balance_loss_mlp": 1.02350879, "epoch": 0.15854501728543513, "flos": 39346890359040.0, "grad_norm": 2.388418559522659, "language_loss": 0.71977961, "learning_rate": 3.82915486733781e-06, "loss": 0.74141967, "num_input_tokens_seen": 57152385, "step": 2637, "time_per_iteration": 2.8375279903411865 }, { "auxiliary_loss_clip": 0.0115052, "auxiliary_loss_mlp": 0.01044842, "balance_loss_clip": 1.05661607, "balance_loss_mlp": 1.02640057, "epoch": 0.15860514053810312, "flos": 24864225454080.0, "grad_norm": 2.1640345554565057, "language_loss": 0.78352648, "learning_rate": 3.82899733013685e-06, "loss": 0.80548006, "num_input_tokens_seen": 57172620, "step": 2638, "time_per_iteration": 2.7298176288604736 }, { "auxiliary_loss_clip": 0.01129706, "auxiliary_loss_mlp": 0.01057375, "balance_loss_clip": 1.05311394, "balance_loss_mlp": 1.03715718, "epoch": 0.1586652637907711, "flos": 26177694082560.0, "grad_norm": 2.325769963269074, "language_loss": 0.75845039, "learning_rate": 3.828839723580128e-06, "loss": 0.78032124, "num_input_tokens_seen": 57194680, "step": 2639, "time_per_iteration": 2.7731449604034424 }, { "auxiliary_loss_clip": 0.01104856, "auxiliary_loss_mlp": 0.01057283, "balance_loss_clip": 1.05350864, "balance_loss_mlp": 1.03772068, "epoch": 0.15872538704343905, "flos": 19792058866560.0, "grad_norm": 2.173238447343554, "language_loss": 0.81319505, "learning_rate": 3.82868204767362e-06, "loss": 0.83481646, "num_input_tokens_seen": 57214675, "step": 2640, "time_per_iteration": 2.8024139404296875 }, { "auxiliary_loss_clip": 0.01135166, "auxiliary_loss_mlp": 0.01054673, "balance_loss_clip": 1.05492401, "balance_loss_mlp": 1.03426492, "epoch": 0.15878551029610702, "flos": 28475366342400.0, "grad_norm": 2.013499020988034, "language_loss": 0.66893363, "learning_rate": 3.828524302423306e-06, "loss": 0.69083202, "num_input_tokens_seen": 57235830, "step": 2641, "time_per_iteration": 2.7519116401672363 }, { "auxiliary_loss_clip": 0.01149448, "auxiliary_loss_mlp": 0.01051949, "balance_loss_clip": 1.05758858, "balance_loss_mlp": 1.0326376, "epoch": 0.15884563354877498, "flos": 24206701040640.0, "grad_norm": 2.139760259286454, "language_loss": 0.7552591, "learning_rate": 3.828366487835167e-06, "loss": 0.77727306, "num_input_tokens_seen": 57255970, "step": 2642, "time_per_iteration": 2.706136465072632 }, { "auxiliary_loss_clip": 0.01156917, "auxiliary_loss_mlp": 0.01042142, "balance_loss_clip": 1.06263423, "balance_loss_mlp": 1.02323556, "epoch": 0.15890575680144295, "flos": 23949795991680.0, "grad_norm": 1.9419610036505286, "language_loss": 0.70564604, "learning_rate": 3.828208603915186e-06, "loss": 0.72763658, "num_input_tokens_seen": 57274435, "step": 2643, "time_per_iteration": 2.682015895843506 }, { "auxiliary_loss_clip": 0.01161783, "auxiliary_loss_mlp": 0.01041643, "balance_loss_clip": 1.05891204, "balance_loss_mlp": 1.02389312, "epoch": 0.15896588005411091, "flos": 21215019127680.0, "grad_norm": 1.846517711414915, "language_loss": 0.78057045, "learning_rate": 3.828050650669353e-06, "loss": 0.80260473, "num_input_tokens_seen": 57293115, "step": 2644, "time_per_iteration": 2.683790922164917 }, { "auxiliary_loss_clip": 0.01151239, "auxiliary_loss_mlp": 0.01050105, "balance_loss_clip": 1.05701637, "balance_loss_mlp": 1.03154373, "epoch": 0.1590260033067789, "flos": 24352390604160.0, "grad_norm": 3.757920662841351, "language_loss": 0.81961924, "learning_rate": 3.827892628103657e-06, "loss": 0.84163266, "num_input_tokens_seen": 57312565, "step": 2645, "time_per_iteration": 2.698085069656372 }, { "auxiliary_loss_clip": 0.01162748, "auxiliary_loss_mlp": 0.01048492, "balance_loss_clip": 1.05487716, "balance_loss_mlp": 1.02854836, "epoch": 0.15908612655944687, "flos": 32048944583040.0, "grad_norm": 2.056693785790565, "language_loss": 0.69412929, "learning_rate": 3.827734536224087e-06, "loss": 0.71624172, "num_input_tokens_seen": 57333360, "step": 2646, "time_per_iteration": 2.7166528701782227 }, { "auxiliary_loss_clip": 0.01135067, "auxiliary_loss_mlp": 0.01040314, "balance_loss_clip": 1.05435526, "balance_loss_mlp": 1.02223015, "epoch": 0.15914624981211484, "flos": 17785370684160.0, "grad_norm": 2.5975497323405055, "language_loss": 0.62932581, "learning_rate": 3.827576375036642e-06, "loss": 0.65107965, "num_input_tokens_seen": 57350575, "step": 2647, "time_per_iteration": 2.7405354976654053 }, { "auxiliary_loss_clip": 0.01160144, "auxiliary_loss_mlp": 0.01047955, "balance_loss_clip": 1.05654776, "balance_loss_mlp": 1.02896523, "epoch": 0.1592063730647828, "flos": 17712507945600.0, "grad_norm": 2.2161421076431025, "language_loss": 0.89490473, "learning_rate": 3.827418144547318e-06, "loss": 0.91698575, "num_input_tokens_seen": 57367570, "step": 2648, "time_per_iteration": 2.6193346977233887 }, { "auxiliary_loss_clip": 0.01158791, "auxiliary_loss_mlp": 0.01048086, "balance_loss_clip": 1.05630398, "balance_loss_mlp": 1.03072906, "epoch": 0.15926649631745077, "flos": 18803545603200.0, "grad_norm": 1.9960039108301237, "language_loss": 0.91307199, "learning_rate": 3.827259844762114e-06, "loss": 0.93514073, "num_input_tokens_seen": 57383980, "step": 2649, "time_per_iteration": 2.6137378215789795 }, { "auxiliary_loss_clip": 0.01099661, "auxiliary_loss_mlp": 0.01044384, "balance_loss_clip": 1.05474401, "balance_loss_mlp": 1.02439272, "epoch": 0.15932661957011873, "flos": 17566243764480.0, "grad_norm": 2.3504548368335767, "language_loss": 0.71782613, "learning_rate": 3.827101475687033e-06, "loss": 0.73926663, "num_input_tokens_seen": 57400840, "step": 2650, "time_per_iteration": 2.8883376121520996 }, { "auxiliary_loss_clip": 0.01146809, "auxiliary_loss_mlp": 0.01041815, "balance_loss_clip": 1.05386841, "balance_loss_mlp": 1.02476835, "epoch": 0.15938674282278673, "flos": 13334351011200.0, "grad_norm": 1.8238326955956992, "language_loss": 0.71427429, "learning_rate": 3.826943037328082e-06, "loss": 0.73616046, "num_input_tokens_seen": 57419230, "step": 2651, "time_per_iteration": 2.607879638671875 }, { "auxiliary_loss_clip": 0.01118842, "auxiliary_loss_mlp": 0.00777496, "balance_loss_clip": 1.05154157, "balance_loss_mlp": 1.00132799, "epoch": 0.1594468660754547, "flos": 22488842119680.0, "grad_norm": 1.8928974850955373, "language_loss": 0.80185902, "learning_rate": 3.8267845296912674e-06, "loss": 0.82082248, "num_input_tokens_seen": 57439315, "step": 2652, "time_per_iteration": 2.718695640563965 }, { "auxiliary_loss_clip": 0.01138048, "auxiliary_loss_mlp": 0.00775, "balance_loss_clip": 1.0567826, "balance_loss_mlp": 1.00124729, "epoch": 0.15950698932812266, "flos": 15007320910080.0, "grad_norm": 2.6116065834427387, "language_loss": 0.69539076, "learning_rate": 3.826625952782601e-06, "loss": 0.71452117, "num_input_tokens_seen": 57454635, "step": 2653, "time_per_iteration": 2.7088639736175537 }, { "auxiliary_loss_clip": 0.01144826, "auxiliary_loss_mlp": 0.01038735, "balance_loss_clip": 1.05257821, "balance_loss_mlp": 1.02050805, "epoch": 0.15956711258079062, "flos": 30155052084480.0, "grad_norm": 2.1937273620657307, "language_loss": 0.76670635, "learning_rate": 3.826467306608095e-06, "loss": 0.78854191, "num_input_tokens_seen": 57476805, "step": 2654, "time_per_iteration": 2.79425048828125 }, { "auxiliary_loss_clip": 0.01114313, "auxiliary_loss_mlp": 0.01041134, "balance_loss_clip": 1.04714727, "balance_loss_mlp": 1.02248931, "epoch": 0.1596272358334586, "flos": 21032700670080.0, "grad_norm": 2.0572535633716247, "language_loss": 0.81873977, "learning_rate": 3.826308591173765e-06, "loss": 0.84029424, "num_input_tokens_seen": 57496400, "step": 2655, "time_per_iteration": 2.6990878582000732 }, { "auxiliary_loss_clip": 0.01112525, "auxiliary_loss_mlp": 0.01046346, "balance_loss_clip": 1.04670715, "balance_loss_mlp": 1.02849984, "epoch": 0.15968735908612655, "flos": 15268032800640.0, "grad_norm": 2.0964800101687486, "language_loss": 0.73768878, "learning_rate": 3.826149806485631e-06, "loss": 0.75927746, "num_input_tokens_seen": 57513700, "step": 2656, "time_per_iteration": 2.7409873008728027 }, { "auxiliary_loss_clip": 0.01111218, "auxiliary_loss_mlp": 0.01039948, "balance_loss_clip": 1.04749918, "balance_loss_mlp": 1.02220988, "epoch": 0.15974748233879452, "flos": 52665726695040.0, "grad_norm": 2.516351978408242, "language_loss": 0.77637637, "learning_rate": 3.825990952549713e-06, "loss": 0.79788804, "num_input_tokens_seen": 57536180, "step": 2657, "time_per_iteration": 2.984161376953125 }, { "auxiliary_loss_clip": 0.01142397, "auxiliary_loss_mlp": 0.01048058, "balance_loss_clip": 1.05276513, "balance_loss_mlp": 1.02984321, "epoch": 0.1598076055914625, "flos": 18733232730240.0, "grad_norm": 2.1741432296797303, "language_loss": 0.74654955, "learning_rate": 3.825832029372035e-06, "loss": 0.76845407, "num_input_tokens_seen": 57555025, "step": 2658, "time_per_iteration": 2.6795172691345215 }, { "auxiliary_loss_clip": 0.01137294, "auxiliary_loss_mlp": 0.01047097, "balance_loss_clip": 1.05887127, "balance_loss_mlp": 1.02581763, "epoch": 0.15986772884413047, "flos": 34349238535680.0, "grad_norm": 2.2676743120149916, "language_loss": 0.75164986, "learning_rate": 3.825673036958624e-06, "loss": 0.77349377, "num_input_tokens_seen": 57577660, "step": 2659, "time_per_iteration": 2.885744094848633 }, { "auxiliary_loss_clip": 0.01122752, "auxiliary_loss_mlp": 0.0105323, "balance_loss_clip": 1.0512991, "balance_loss_mlp": 1.0334295, "epoch": 0.15992785209679844, "flos": 22054969739520.0, "grad_norm": 2.181311046841435, "language_loss": 0.90998709, "learning_rate": 3.825513975315508e-06, "loss": 0.93174696, "num_input_tokens_seen": 57596335, "step": 2660, "time_per_iteration": 2.7562267780303955 }, { "auxiliary_loss_clip": 0.01114547, "auxiliary_loss_mlp": 0.01058378, "balance_loss_clip": 1.05538487, "balance_loss_mlp": 1.03590751, "epoch": 0.1599879753494664, "flos": 33066652625280.0, "grad_norm": 1.746468400789071, "language_loss": 0.77724659, "learning_rate": 3.82535484444872e-06, "loss": 0.79897583, "num_input_tokens_seen": 57616830, "step": 2661, "time_per_iteration": 2.9896914958953857 }, { "auxiliary_loss_clip": 0.0113781, "auxiliary_loss_mlp": 0.00777461, "balance_loss_clip": 1.05382478, "balance_loss_mlp": 1.00132632, "epoch": 0.16004809860213437, "flos": 28038010343040.0, "grad_norm": 2.0483033922540086, "language_loss": 0.74442393, "learning_rate": 3.825195644364292e-06, "loss": 0.76357663, "num_input_tokens_seen": 57635515, "step": 2662, "time_per_iteration": 2.7993714809417725 }, { "auxiliary_loss_clip": 0.01135674, "auxiliary_loss_mlp": 0.00780783, "balance_loss_clip": 1.05392313, "balance_loss_mlp": 1.0016191, "epoch": 0.16010822185480234, "flos": 22780113505920.0, "grad_norm": 2.9903694104875984, "language_loss": 0.82515085, "learning_rate": 3.825036375068263e-06, "loss": 0.84431541, "num_input_tokens_seen": 57654250, "step": 2663, "time_per_iteration": 2.678490161895752 }, { "auxiliary_loss_clip": 0.01112205, "auxiliary_loss_mlp": 0.01044917, "balance_loss_clip": 1.05182636, "balance_loss_mlp": 1.02574801, "epoch": 0.16016834510747033, "flos": 20084012611200.0, "grad_norm": 2.06786422122115, "language_loss": 0.7951405, "learning_rate": 3.824877036566672e-06, "loss": 0.81671166, "num_input_tokens_seen": 57672645, "step": 2664, "time_per_iteration": 2.819880962371826 }, { "auxiliary_loss_clip": 0.01151449, "auxiliary_loss_mlp": 0.01048023, "balance_loss_clip": 1.05374622, "balance_loss_mlp": 1.02886605, "epoch": 0.1602284683601383, "flos": 21173829206400.0, "grad_norm": 1.6697703441146605, "language_loss": 0.93748474, "learning_rate": 3.824717628865561e-06, "loss": 0.95947945, "num_input_tokens_seen": 57691055, "step": 2665, "time_per_iteration": 2.697660446166992 }, { "auxiliary_loss_clip": 0.01127607, "auxiliary_loss_mlp": 0.01047415, "balance_loss_clip": 1.05185676, "balance_loss_mlp": 1.02774525, "epoch": 0.16028859161280626, "flos": 14647568244480.0, "grad_norm": 2.9655602739253095, "language_loss": 0.85237324, "learning_rate": 3.824558151970974e-06, "loss": 0.87412339, "num_input_tokens_seen": 57707235, "step": 2666, "time_per_iteration": 4.282273530960083 }, { "auxiliary_loss_clip": 0.01129818, "auxiliary_loss_mlp": 0.00777125, "balance_loss_clip": 1.05257225, "balance_loss_mlp": 1.00145936, "epoch": 0.16034871486547422, "flos": 20990325600000.0, "grad_norm": 1.8366839898970433, "language_loss": 0.81284773, "learning_rate": 3.8243986058889595e-06, "loss": 0.83191717, "num_input_tokens_seen": 57724190, "step": 2667, "time_per_iteration": 2.69508695602417 }, { "auxiliary_loss_clip": 0.0116556, "auxiliary_loss_mlp": 0.01046526, "balance_loss_clip": 1.06089485, "balance_loss_mlp": 1.02643883, "epoch": 0.1604088381181422, "flos": 21397732634880.0, "grad_norm": 1.958935842080623, "language_loss": 0.74031079, "learning_rate": 3.824238990625567e-06, "loss": 0.76243162, "num_input_tokens_seen": 57743620, "step": 2668, "time_per_iteration": 4.2559425830841064 }, { "auxiliary_loss_clip": 0.01148853, "auxiliary_loss_mlp": 0.01051992, "balance_loss_clip": 1.05547619, "balance_loss_mlp": 1.03240585, "epoch": 0.16046896137081015, "flos": 23877040993920.0, "grad_norm": 1.7737626564305047, "language_loss": 0.77495629, "learning_rate": 3.824079306186848e-06, "loss": 0.7969647, "num_input_tokens_seen": 57764810, "step": 2669, "time_per_iteration": 2.6424050331115723 }, { "auxiliary_loss_clip": 0.01097339, "auxiliary_loss_mlp": 0.01012737, "balance_loss_clip": 1.06351233, "balance_loss_mlp": 1.00986385, "epoch": 0.16052908462347812, "flos": 59806709015040.0, "grad_norm": 0.8041290684345284, "language_loss": 0.5549804, "learning_rate": 3.823919552578861e-06, "loss": 0.57608116, "num_input_tokens_seen": 57824390, "step": 2670, "time_per_iteration": 4.765664100646973 }, { "auxiliary_loss_clip": 0.01149639, "auxiliary_loss_mlp": 0.01043383, "balance_loss_clip": 1.05322218, "balance_loss_mlp": 1.02430916, "epoch": 0.1605892078761461, "flos": 18296559089280.0, "grad_norm": 2.6306224128650464, "language_loss": 0.77778888, "learning_rate": 3.82375972980766e-06, "loss": 0.7997191, "num_input_tokens_seen": 57843665, "step": 2671, "time_per_iteration": 2.6876416206359863 }, { "auxiliary_loss_clip": 0.01151164, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.05529547, "balance_loss_mlp": 1.02503204, "epoch": 0.16064933112881408, "flos": 32160734686080.0, "grad_norm": 1.9167251889277674, "language_loss": 0.64766788, "learning_rate": 3.8235998378793086e-06, "loss": 0.66961908, "num_input_tokens_seen": 57863305, "step": 2672, "time_per_iteration": 2.7102553844451904 }, { "auxiliary_loss_clip": 0.01150206, "auxiliary_loss_mlp": 0.01046785, "balance_loss_clip": 1.05674481, "balance_loss_mlp": 1.02554154, "epoch": 0.16070945438148204, "flos": 19828795501440.0, "grad_norm": 2.045175098484539, "language_loss": 0.85708207, "learning_rate": 3.8234398767998675e-06, "loss": 0.87905198, "num_input_tokens_seen": 57883025, "step": 2673, "time_per_iteration": 2.656360626220703 }, { "auxiliary_loss_clip": 0.01125542, "auxiliary_loss_mlp": 0.01055838, "balance_loss_clip": 1.05366015, "balance_loss_mlp": 1.03716969, "epoch": 0.16076957763415, "flos": 18913144976640.0, "grad_norm": 2.339006860757087, "language_loss": 0.7289716, "learning_rate": 3.823279846575403e-06, "loss": 0.75078535, "num_input_tokens_seen": 57901430, "step": 2674, "time_per_iteration": 2.7122414112091064 }, { "auxiliary_loss_clip": 0.01150063, "auxiliary_loss_mlp": 0.01045468, "balance_loss_clip": 1.05416465, "balance_loss_mlp": 1.02464211, "epoch": 0.16082970088681797, "flos": 16764358590720.0, "grad_norm": 1.9341682597436423, "language_loss": 0.84438515, "learning_rate": 3.823119747211986e-06, "loss": 0.86634052, "num_input_tokens_seen": 57919550, "step": 2675, "time_per_iteration": 2.6646435260772705 }, { "auxiliary_loss_clip": 0.01116221, "auxiliary_loss_mlp": 0.01049343, "balance_loss_clip": 1.05220723, "balance_loss_mlp": 1.02823126, "epoch": 0.16088982413948594, "flos": 35150261783040.0, "grad_norm": 1.871909119220515, "language_loss": 0.82216591, "learning_rate": 3.822959578715685e-06, "loss": 0.84382153, "num_input_tokens_seen": 57939890, "step": 2676, "time_per_iteration": 2.8457534313201904 }, { "auxiliary_loss_clip": 0.01151157, "auxiliary_loss_mlp": 0.01049874, "balance_loss_clip": 1.05746996, "balance_loss_mlp": 1.03162253, "epoch": 0.1609499473921539, "flos": 18625105814400.0, "grad_norm": 2.1166154816193923, "language_loss": 0.73485494, "learning_rate": 3.822799341092573e-06, "loss": 0.75686526, "num_input_tokens_seen": 57957410, "step": 2677, "time_per_iteration": 2.65387225151062 }, { "auxiliary_loss_clip": 0.01138188, "auxiliary_loss_mlp": 0.01044363, "balance_loss_clip": 1.05438483, "balance_loss_mlp": 1.02537322, "epoch": 0.1610100706448219, "flos": 33145728416640.0, "grad_norm": 3.229282061984371, "language_loss": 0.76305777, "learning_rate": 3.822639034348728e-06, "loss": 0.78488332, "num_input_tokens_seen": 57977900, "step": 2678, "time_per_iteration": 2.836071014404297 }, { "auxiliary_loss_clip": 0.01148252, "auxiliary_loss_mlp": 0.01047887, "balance_loss_clip": 1.05379987, "balance_loss_mlp": 1.02789569, "epoch": 0.16107019389748986, "flos": 34676707852800.0, "grad_norm": 8.295814069484678, "language_loss": 0.70340431, "learning_rate": 3.822478658490228e-06, "loss": 0.7253657, "num_input_tokens_seen": 57998210, "step": 2679, "time_per_iteration": 2.771185874938965 }, { "auxiliary_loss_clip": 0.01059502, "auxiliary_loss_mlp": 0.00758644, "balance_loss_clip": 1.04695845, "balance_loss_mlp": 1.00150955, "epoch": 0.16113031715015783, "flos": 65713403260800.0, "grad_norm": 0.7819629653273137, "language_loss": 0.51843339, "learning_rate": 3.822318213523154e-06, "loss": 0.53661484, "num_input_tokens_seen": 58059420, "step": 2680, "time_per_iteration": 3.3107378482818604 }, { "auxiliary_loss_clip": 0.01144342, "auxiliary_loss_mlp": 0.01047358, "balance_loss_clip": 1.05360317, "balance_loss_mlp": 1.02632904, "epoch": 0.1611904404028258, "flos": 20810413353600.0, "grad_norm": 1.6718368455031125, "language_loss": 0.8028667, "learning_rate": 3.8221576994535925e-06, "loss": 0.82478368, "num_input_tokens_seen": 58078370, "step": 2681, "time_per_iteration": 2.6986513137817383 }, { "auxiliary_loss_clip": 0.01139192, "auxiliary_loss_mlp": 0.01055518, "balance_loss_clip": 1.05603266, "balance_loss_mlp": 1.03602743, "epoch": 0.16125056365549376, "flos": 27013335062400.0, "grad_norm": 2.154781054673542, "language_loss": 0.68957973, "learning_rate": 3.821997116287627e-06, "loss": 0.71152687, "num_input_tokens_seen": 58097395, "step": 2682, "time_per_iteration": 2.794686794281006 }, { "auxiliary_loss_clip": 0.01139216, "auxiliary_loss_mlp": 0.01052349, "balance_loss_clip": 1.05670619, "balance_loss_mlp": 1.03195262, "epoch": 0.16131068690816172, "flos": 19276524915840.0, "grad_norm": 1.9802191055590168, "language_loss": 0.87362224, "learning_rate": 3.821836464031348e-06, "loss": 0.89553785, "num_input_tokens_seen": 58115630, "step": 2683, "time_per_iteration": 2.703634262084961 }, { "auxiliary_loss_clip": 0.01165497, "auxiliary_loss_mlp": 0.0105575, "balance_loss_clip": 1.05714059, "balance_loss_mlp": 1.03491259, "epoch": 0.16137081016082971, "flos": 35337931367040.0, "grad_norm": 1.939499216066865, "language_loss": 0.74143028, "learning_rate": 3.821675742690849e-06, "loss": 0.76364273, "num_input_tokens_seen": 58138655, "step": 2684, "time_per_iteration": 2.7890264987945557 }, { "auxiliary_loss_clip": 0.01136683, "auxiliary_loss_mlp": 0.00778989, "balance_loss_clip": 1.05435085, "balance_loss_mlp": 1.00176883, "epoch": 0.16143093341349768, "flos": 34235257703040.0, "grad_norm": 1.9009911635557044, "language_loss": 0.70506597, "learning_rate": 3.821514952272223e-06, "loss": 0.72422272, "num_input_tokens_seen": 58157440, "step": 2685, "time_per_iteration": 2.803942918777466 }, { "auxiliary_loss_clip": 0.01116315, "auxiliary_loss_mlp": 0.01059092, "balance_loss_clip": 1.05291295, "balance_loss_mlp": 1.03757524, "epoch": 0.16149105666616564, "flos": 27999262546560.0, "grad_norm": 2.295686008167468, "language_loss": 0.72060591, "learning_rate": 3.821354092781567e-06, "loss": 0.74236, "num_input_tokens_seen": 58176660, "step": 2686, "time_per_iteration": 2.850309133529663 }, { "auxiliary_loss_clip": 0.01153803, "auxiliary_loss_mlp": 0.01048887, "balance_loss_clip": 1.05603862, "balance_loss_mlp": 1.02922952, "epoch": 0.1615511799188336, "flos": 19422214479360.0, "grad_norm": 2.056921120199424, "language_loss": 0.81720114, "learning_rate": 3.821193164224981e-06, "loss": 0.83922803, "num_input_tokens_seen": 58195085, "step": 2687, "time_per_iteration": 2.7077832221984863 }, { "auxiliary_loss_clip": 0.01154388, "auxiliary_loss_mlp": 0.01050682, "balance_loss_clip": 1.05335689, "balance_loss_mlp": 1.02910483, "epoch": 0.16161130317150157, "flos": 22854915578880.0, "grad_norm": 1.6747986106054085, "language_loss": 0.71680355, "learning_rate": 3.821032166608568e-06, "loss": 0.73885429, "num_input_tokens_seen": 58213540, "step": 2688, "time_per_iteration": 2.700073480606079 }, { "auxiliary_loss_clip": 0.0112226, "auxiliary_loss_mlp": 0.0105252, "balance_loss_clip": 1.0517168, "balance_loss_mlp": 1.03330338, "epoch": 0.16167142642416954, "flos": 26110577520000.0, "grad_norm": 2.2887064413695253, "language_loss": 0.76168394, "learning_rate": 3.8208710999384325e-06, "loss": 0.78343177, "num_input_tokens_seen": 58236995, "step": 2689, "time_per_iteration": 2.846964120864868 }, { "auxiliary_loss_clip": 0.01166324, "auxiliary_loss_mlp": 0.01052979, "balance_loss_clip": 1.05979431, "balance_loss_mlp": 1.03308284, "epoch": 0.1617315496768375, "flos": 22779646629120.0, "grad_norm": 2.045037041298705, "language_loss": 0.87211925, "learning_rate": 3.820709964220683e-06, "loss": 0.89431226, "num_input_tokens_seen": 58257230, "step": 2690, "time_per_iteration": 2.704497814178467 }, { "auxiliary_loss_clip": 0.01143898, "auxiliary_loss_mlp": 0.01046571, "balance_loss_clip": 1.05318451, "balance_loss_mlp": 1.02890396, "epoch": 0.1617916729295055, "flos": 22017299351040.0, "grad_norm": 1.7518031225399346, "language_loss": 0.87899524, "learning_rate": 3.8205487594614284e-06, "loss": 0.90089989, "num_input_tokens_seen": 58277080, "step": 2691, "time_per_iteration": 2.6763153076171875 }, { "auxiliary_loss_clip": 0.01150265, "auxiliary_loss_mlp": 0.01053114, "balance_loss_clip": 1.05237532, "balance_loss_mlp": 1.03142977, "epoch": 0.16185179618217346, "flos": 23438248450560.0, "grad_norm": 2.1723450057475313, "language_loss": 0.81989783, "learning_rate": 3.820387485666784e-06, "loss": 0.84193164, "num_input_tokens_seen": 58294815, "step": 2692, "time_per_iteration": 2.6381001472473145 }, { "auxiliary_loss_clip": 0.01167881, "auxiliary_loss_mlp": 0.0104606, "balance_loss_clip": 1.05555534, "balance_loss_mlp": 1.02499604, "epoch": 0.16191191943484143, "flos": 25666110627840.0, "grad_norm": 2.194958172554253, "language_loss": 0.81381011, "learning_rate": 3.820226142842862e-06, "loss": 0.83594954, "num_input_tokens_seen": 58313215, "step": 2693, "time_per_iteration": 2.6366944313049316 }, { "auxiliary_loss_clip": 0.01164466, "auxiliary_loss_mlp": 0.01058298, "balance_loss_clip": 1.0587461, "balance_loss_mlp": 1.03991616, "epoch": 0.1619720426875094, "flos": 23477355383040.0, "grad_norm": 2.778189532536263, "language_loss": 0.83837044, "learning_rate": 3.820064730995783e-06, "loss": 0.86059809, "num_input_tokens_seen": 58333215, "step": 2694, "time_per_iteration": 2.7802140712738037 }, { "auxiliary_loss_clip": 0.01116209, "auxiliary_loss_mlp": 0.0105764, "balance_loss_clip": 1.04927421, "balance_loss_mlp": 1.0366354, "epoch": 0.16203216594017736, "flos": 24133658734080.0, "grad_norm": 1.8201511645490482, "language_loss": 0.69709098, "learning_rate": 3.819903250131667e-06, "loss": 0.71882945, "num_input_tokens_seen": 58351160, "step": 2695, "time_per_iteration": 2.756904125213623 }, { "auxiliary_loss_clip": 0.01155526, "auxiliary_loss_mlp": 0.01050837, "balance_loss_clip": 1.05799723, "balance_loss_mlp": 1.03026128, "epoch": 0.16209228919284532, "flos": 22340889999360.0, "grad_norm": 2.1550523064219487, "language_loss": 0.82986331, "learning_rate": 3.819741700256637e-06, "loss": 0.85192692, "num_input_tokens_seen": 58368505, "step": 2696, "time_per_iteration": 2.651510238647461 }, { "auxiliary_loss_clip": 0.01174193, "auxiliary_loss_mlp": 0.01052819, "balance_loss_clip": 1.05826569, "balance_loss_mlp": 1.03095615, "epoch": 0.1621524124455133, "flos": 15815131827840.0, "grad_norm": 2.9267990143146503, "language_loss": 0.8862049, "learning_rate": 3.8195800813768194e-06, "loss": 0.90847504, "num_input_tokens_seen": 58385085, "step": 2697, "time_per_iteration": 2.5935380458831787 }, { "auxiliary_loss_clip": 0.01158945, "auxiliary_loss_mlp": 0.01045471, "balance_loss_clip": 1.0552485, "balance_loss_mlp": 1.02719641, "epoch": 0.16221253569818128, "flos": 30186688988160.0, "grad_norm": 1.7480298293719791, "language_loss": 0.80844599, "learning_rate": 3.819418393498343e-06, "loss": 0.83049017, "num_input_tokens_seen": 58406985, "step": 2698, "time_per_iteration": 2.6685965061187744 }, { "auxiliary_loss_clip": 0.01151678, "auxiliary_loss_mlp": 0.01050084, "balance_loss_clip": 1.05785704, "balance_loss_mlp": 1.03060579, "epoch": 0.16227265895084925, "flos": 24605991601920.0, "grad_norm": 1.590231062064763, "language_loss": 0.77499473, "learning_rate": 3.819256636627339e-06, "loss": 0.79701245, "num_input_tokens_seen": 58426205, "step": 2699, "time_per_iteration": 2.7206287384033203 }, { "auxiliary_loss_clip": 0.01134482, "auxiliary_loss_mlp": 0.01043888, "balance_loss_clip": 1.0504272, "balance_loss_mlp": 1.02510071, "epoch": 0.1623327822035172, "flos": 19573326996480.0, "grad_norm": 2.299083669251571, "language_loss": 0.85903585, "learning_rate": 3.81909481076994e-06, "loss": 0.88081944, "num_input_tokens_seen": 58443830, "step": 2700, "time_per_iteration": 2.6440224647521973 }, { "auxiliary_loss_clip": 0.01150266, "auxiliary_loss_mlp": 0.00778348, "balance_loss_clip": 1.05360484, "balance_loss_mlp": 1.00180686, "epoch": 0.16239290545618518, "flos": 26468462678400.0, "grad_norm": 1.7679372116400307, "language_loss": 0.80424523, "learning_rate": 3.818932915932284e-06, "loss": 0.82353133, "num_input_tokens_seen": 58464405, "step": 2701, "time_per_iteration": 2.6943976879119873 }, { "auxiliary_loss_clip": 0.01144477, "auxiliary_loss_mlp": 0.01046291, "balance_loss_clip": 1.05771017, "balance_loss_mlp": 1.02664542, "epoch": 0.16245302870885314, "flos": 15851940289920.0, "grad_norm": 1.6539412057050027, "language_loss": 0.72777367, "learning_rate": 3.818770952120511e-06, "loss": 0.74968135, "num_input_tokens_seen": 58483295, "step": 2702, "time_per_iteration": 2.6914141178131104 }, { "auxiliary_loss_clip": 0.01156069, "auxiliary_loss_mlp": 0.01050141, "balance_loss_clip": 1.05802381, "balance_loss_mlp": 1.02896905, "epoch": 0.1625131519615211, "flos": 14756521173120.0, "grad_norm": 1.8265391375227176, "language_loss": 0.7273894, "learning_rate": 3.81860891934076e-06, "loss": 0.74945152, "num_input_tokens_seen": 58501205, "step": 2703, "time_per_iteration": 2.6301820278167725 }, { "auxiliary_loss_clip": 0.01165642, "auxiliary_loss_mlp": 0.01050857, "balance_loss_clip": 1.0553968, "balance_loss_mlp": 1.02942359, "epoch": 0.1625732752141891, "flos": 28220508368640.0, "grad_norm": 3.0329584489902666, "language_loss": 0.70018482, "learning_rate": 3.818446817599176e-06, "loss": 0.72234988, "num_input_tokens_seen": 58522315, "step": 2704, "time_per_iteration": 2.6667227745056152 }, { "auxiliary_loss_clip": 0.01034679, "auxiliary_loss_mlp": 0.01001657, "balance_loss_clip": 1.03343439, "balance_loss_mlp": 0.99865305, "epoch": 0.16263339846685707, "flos": 67327947688320.0, "grad_norm": 0.7801109588151329, "language_loss": 0.5336051, "learning_rate": 3.818284646901907e-06, "loss": 0.55396849, "num_input_tokens_seen": 58586695, "step": 2705, "time_per_iteration": 4.808594465255737 }, { "auxiliary_loss_clip": 0.01138628, "auxiliary_loss_mlp": 0.00781324, "balance_loss_clip": 1.0539608, "balance_loss_mlp": 1.00171995, "epoch": 0.16269352171952503, "flos": 14319165173760.0, "grad_norm": 2.3827832530074455, "language_loss": 0.7536028, "learning_rate": 3.818122407255102e-06, "loss": 0.77280229, "num_input_tokens_seen": 58602435, "step": 2706, "time_per_iteration": 4.126614570617676 }, { "auxiliary_loss_clip": 0.01130684, "auxiliary_loss_mlp": 0.01047489, "balance_loss_clip": 1.0523324, "balance_loss_mlp": 1.02859437, "epoch": 0.162753644972193, "flos": 28361205941760.0, "grad_norm": 2.2272392184651038, "language_loss": 0.72203928, "learning_rate": 3.817960098664914e-06, "loss": 0.74382102, "num_input_tokens_seen": 58621275, "step": 2707, "time_per_iteration": 4.2739410400390625 }, { "auxiliary_loss_clip": 0.01142142, "auxiliary_loss_mlp": 0.01047652, "balance_loss_clip": 1.05433679, "balance_loss_mlp": 1.02898431, "epoch": 0.16281376822486096, "flos": 19937856170880.0, "grad_norm": 3.192481802987827, "language_loss": 0.83481139, "learning_rate": 3.817797721137495e-06, "loss": 0.85670936, "num_input_tokens_seen": 58637550, "step": 2708, "time_per_iteration": 2.7163965702056885 }, { "auxiliary_loss_clip": 0.01101561, "auxiliary_loss_mlp": 0.00781217, "balance_loss_clip": 1.04896522, "balance_loss_mlp": 1.00177419, "epoch": 0.16287389147752893, "flos": 21251719848960.0, "grad_norm": 2.2850459718507654, "language_loss": 0.86162847, "learning_rate": 3.817635274679006e-06, "loss": 0.88045627, "num_input_tokens_seen": 58654135, "step": 2709, "time_per_iteration": 4.474989652633667 }, { "auxiliary_loss_clip": 0.0114031, "auxiliary_loss_mlp": 0.00777602, "balance_loss_clip": 1.05267572, "balance_loss_mlp": 1.00172114, "epoch": 0.1629340147301969, "flos": 19244672530560.0, "grad_norm": 2.581053296112052, "language_loss": 0.91410124, "learning_rate": 3.817472759295605e-06, "loss": 0.93328035, "num_input_tokens_seen": 58674320, "step": 2710, "time_per_iteration": 2.6951892375946045 }, { "auxiliary_loss_clip": 0.01118597, "auxiliary_loss_mlp": 0.01054854, "balance_loss_clip": 1.05254805, "balance_loss_mlp": 1.03451669, "epoch": 0.16299413798286488, "flos": 21249816428160.0, "grad_norm": 2.4322540773438437, "language_loss": 0.81690979, "learning_rate": 3.817310174993453e-06, "loss": 0.83864427, "num_input_tokens_seen": 58691000, "step": 2711, "time_per_iteration": 2.7854437828063965 }, { "auxiliary_loss_clip": 0.01146056, "auxiliary_loss_mlp": 0.01040648, "balance_loss_clip": 1.04954815, "balance_loss_mlp": 1.02107334, "epoch": 0.16305426123553285, "flos": 18770579896320.0, "grad_norm": 3.73256798888747, "language_loss": 0.8091476, "learning_rate": 3.817147521778719e-06, "loss": 0.83101463, "num_input_tokens_seen": 58710230, "step": 2712, "time_per_iteration": 2.834291458129883 }, { "auxiliary_loss_clip": 0.01171211, "auxiliary_loss_mlp": 0.01053015, "balance_loss_clip": 1.0590024, "balance_loss_mlp": 1.03273714, "epoch": 0.16311438448820081, "flos": 22087648137600.0, "grad_norm": 2.3460895846171996, "language_loss": 0.7681579, "learning_rate": 3.816984799657568e-06, "loss": 0.79040015, "num_input_tokens_seen": 58728610, "step": 2713, "time_per_iteration": 2.6188278198242188 }, { "auxiliary_loss_clip": 0.01156539, "auxiliary_loss_mlp": 0.0105792, "balance_loss_clip": 1.06240916, "balance_loss_mlp": 1.03832221, "epoch": 0.16317450774086878, "flos": 16467700164480.0, "grad_norm": 2.543173325075216, "language_loss": 0.79012156, "learning_rate": 3.8168220086361715e-06, "loss": 0.81226611, "num_input_tokens_seen": 58744385, "step": 2714, "time_per_iteration": 2.6534018516540527 }, { "auxiliary_loss_clip": 0.01149567, "auxiliary_loss_mlp": 0.01056152, "balance_loss_clip": 1.05467987, "balance_loss_mlp": 1.03724504, "epoch": 0.16323463099353674, "flos": 24352929308160.0, "grad_norm": 1.614702766215493, "language_loss": 0.77693665, "learning_rate": 3.816659148720702e-06, "loss": 0.79899377, "num_input_tokens_seen": 58763905, "step": 2715, "time_per_iteration": 2.856006383895874 }, { "auxiliary_loss_clip": 0.01129437, "auxiliary_loss_mlp": 0.01044046, "balance_loss_clip": 1.04810584, "balance_loss_mlp": 1.02525854, "epoch": 0.1632947542462047, "flos": 24900782520960.0, "grad_norm": 2.374975046722651, "language_loss": 0.81513858, "learning_rate": 3.816496219917336e-06, "loss": 0.83687335, "num_input_tokens_seen": 58785580, "step": 2716, "time_per_iteration": 2.6750845909118652 }, { "auxiliary_loss_clip": 0.01144393, "auxiliary_loss_mlp": 0.01055927, "balance_loss_clip": 1.05851114, "balance_loss_mlp": 1.03703237, "epoch": 0.1633548774988727, "flos": 24900279730560.0, "grad_norm": 1.8186679286330678, "language_loss": 0.86522418, "learning_rate": 3.816333222232251e-06, "loss": 0.88722742, "num_input_tokens_seen": 58806075, "step": 2717, "time_per_iteration": 2.761622428894043 }, { "auxiliary_loss_clip": 0.01135377, "auxiliary_loss_mlp": 0.01045964, "balance_loss_clip": 1.05334044, "balance_loss_mlp": 1.0274632, "epoch": 0.16341500075154067, "flos": 30441798357120.0, "grad_norm": 1.8799656187942837, "language_loss": 0.76924133, "learning_rate": 3.816170155671629e-06, "loss": 0.79105473, "num_input_tokens_seen": 58827405, "step": 2718, "time_per_iteration": 2.7946770191192627 }, { "auxiliary_loss_clip": 0.01145146, "auxiliary_loss_mlp": 0.01043682, "balance_loss_clip": 1.05553615, "balance_loss_mlp": 1.02566922, "epoch": 0.16347512400420863, "flos": 22784530878720.0, "grad_norm": 2.2449478392049906, "language_loss": 0.73827291, "learning_rate": 3.816007020241652e-06, "loss": 0.76016116, "num_input_tokens_seen": 58847205, "step": 2719, "time_per_iteration": 2.719980478286743 }, { "auxiliary_loss_clip": 0.01128361, "auxiliary_loss_mlp": 0.01045887, "balance_loss_clip": 1.04900515, "balance_loss_mlp": 1.02732563, "epoch": 0.1635352472568766, "flos": 22633274707200.0, "grad_norm": 1.7092252575708884, "language_loss": 0.72267497, "learning_rate": 3.815843815948507e-06, "loss": 0.74441749, "num_input_tokens_seen": 58866865, "step": 2720, "time_per_iteration": 2.8737292289733887 }, { "auxiliary_loss_clip": 0.01109456, "auxiliary_loss_mlp": 0.01049703, "balance_loss_clip": 1.05004287, "balance_loss_mlp": 1.02840054, "epoch": 0.16359537050954456, "flos": 15522998515200.0, "grad_norm": 2.1621365878543153, "language_loss": 0.75120997, "learning_rate": 3.8156805427983824e-06, "loss": 0.77280164, "num_input_tokens_seen": 58885200, "step": 2721, "time_per_iteration": 2.785296678543091 }, { "auxiliary_loss_clip": 0.01110342, "auxiliary_loss_mlp": 0.01059955, "balance_loss_clip": 1.04597676, "balance_loss_mlp": 1.03734064, "epoch": 0.16365549376221253, "flos": 22090162089600.0, "grad_norm": 1.9032438792006017, "language_loss": 0.79073942, "learning_rate": 3.8155172007974695e-06, "loss": 0.81244236, "num_input_tokens_seen": 58906385, "step": 2722, "time_per_iteration": 2.7850708961486816 }, { "auxiliary_loss_clip": 0.01149809, "auxiliary_loss_mlp": 0.00778798, "balance_loss_clip": 1.05395257, "balance_loss_mlp": 1.00171757, "epoch": 0.1637156170148805, "flos": 24060400945920.0, "grad_norm": 2.3019049903761215, "language_loss": 0.84954333, "learning_rate": 3.8153537899519624e-06, "loss": 0.86882937, "num_input_tokens_seen": 58925040, "step": 2723, "time_per_iteration": 2.7268764972686768 }, { "auxiliary_loss_clip": 0.01108328, "auxiliary_loss_mlp": 0.01044851, "balance_loss_clip": 1.04805517, "balance_loss_mlp": 1.02493143, "epoch": 0.1637757402675485, "flos": 26685362954880.0, "grad_norm": 1.8985615531712963, "language_loss": 0.71018666, "learning_rate": 3.815190310268058e-06, "loss": 0.73171842, "num_input_tokens_seen": 58944790, "step": 2724, "time_per_iteration": 2.7691783905029297 }, { "auxiliary_loss_clip": 0.01118053, "auxiliary_loss_mlp": 0.01041883, "balance_loss_clip": 1.05226958, "balance_loss_mlp": 1.02364373, "epoch": 0.16383586352021645, "flos": 16106941918080.0, "grad_norm": 2.1059770262776136, "language_loss": 0.70552838, "learning_rate": 3.815026761751955e-06, "loss": 0.72712779, "num_input_tokens_seen": 58962500, "step": 2725, "time_per_iteration": 2.6936957836151123 }, { "auxiliary_loss_clip": 0.01112368, "auxiliary_loss_mlp": 0.01046594, "balance_loss_clip": 1.04912174, "balance_loss_mlp": 1.028391, "epoch": 0.16389598677288442, "flos": 19165991788800.0, "grad_norm": 2.27810298992254, "language_loss": 0.88491893, "learning_rate": 3.814863144409855e-06, "loss": 0.90650856, "num_input_tokens_seen": 58980355, "step": 2726, "time_per_iteration": 2.7967143058776855 }, { "auxiliary_loss_clip": 0.01157668, "auxiliary_loss_mlp": 0.0105068, "balance_loss_clip": 1.06062055, "balance_loss_mlp": 1.03099847, "epoch": 0.16395611002555238, "flos": 21507008785920.0, "grad_norm": 2.0584475237926303, "language_loss": 0.7469939, "learning_rate": 3.814699458247963e-06, "loss": 0.7690773, "num_input_tokens_seen": 58999505, "step": 2727, "time_per_iteration": 2.6818623542785645 }, { "auxiliary_loss_clip": 0.01150971, "auxiliary_loss_mlp": 0.01052077, "balance_loss_clip": 1.0570507, "balance_loss_mlp": 1.03527999, "epoch": 0.16401623327822035, "flos": 21470918595840.0, "grad_norm": 1.6112579442237729, "language_loss": 0.83097756, "learning_rate": 3.8145357032724855e-06, "loss": 0.85300803, "num_input_tokens_seen": 59017930, "step": 2728, "time_per_iteration": 2.675360918045044 }, { "auxiliary_loss_clip": 0.01156153, "auxiliary_loss_mlp": 0.01045609, "balance_loss_clip": 1.05826735, "balance_loss_mlp": 1.02602315, "epoch": 0.1640763565308883, "flos": 13626232928640.0, "grad_norm": 2.5738755626941106, "language_loss": 0.84892929, "learning_rate": 3.814371879489633e-06, "loss": 0.87094688, "num_input_tokens_seen": 59035130, "step": 2729, "time_per_iteration": 2.7004599571228027 }, { "auxiliary_loss_clip": 0.01167293, "auxiliary_loss_mlp": 0.01048461, "balance_loss_clip": 1.0591594, "balance_loss_mlp": 1.03053224, "epoch": 0.16413647978355628, "flos": 15451464579840.0, "grad_norm": 1.9897225699042427, "language_loss": 0.72895479, "learning_rate": 3.814207986905616e-06, "loss": 0.75111228, "num_input_tokens_seen": 59053080, "step": 2730, "time_per_iteration": 2.593179702758789 }, { "auxiliary_loss_clip": 0.01142509, "auxiliary_loss_mlp": 0.01050071, "balance_loss_clip": 1.05208349, "balance_loss_mlp": 1.02908981, "epoch": 0.16419660303622427, "flos": 45878682015360.0, "grad_norm": 1.6754501336017709, "language_loss": 0.74384654, "learning_rate": 3.814044025526651e-06, "loss": 0.76577234, "num_input_tokens_seen": 59075610, "step": 2731, "time_per_iteration": 2.8702962398529053 }, { "auxiliary_loss_clip": 0.01122791, "auxiliary_loss_mlp": 0.01047176, "balance_loss_clip": 1.05006754, "balance_loss_mlp": 1.02650499, "epoch": 0.16425672628889224, "flos": 18952826526720.0, "grad_norm": 2.031351475505915, "language_loss": 0.79190683, "learning_rate": 3.8138799953589548e-06, "loss": 0.8136065, "num_input_tokens_seen": 59094555, "step": 2732, "time_per_iteration": 2.734529972076416 }, { "auxiliary_loss_clip": 0.01141118, "auxiliary_loss_mlp": 0.01047385, "balance_loss_clip": 1.05340672, "balance_loss_mlp": 1.02796555, "epoch": 0.1643168495415602, "flos": 24312996362880.0, "grad_norm": 2.250003976384769, "language_loss": 0.69526887, "learning_rate": 3.8137158964087473e-06, "loss": 0.71715385, "num_input_tokens_seen": 59113515, "step": 2733, "time_per_iteration": 2.672377109527588 }, { "auxiliary_loss_clip": 0.01143332, "auxiliary_loss_mlp": 0.01053232, "balance_loss_clip": 1.05603123, "balance_loss_mlp": 1.0325135, "epoch": 0.16437697279422817, "flos": 26428421992320.0, "grad_norm": 2.000873580428856, "language_loss": 0.80976766, "learning_rate": 3.8135517286822508e-06, "loss": 0.83173329, "num_input_tokens_seen": 59133275, "step": 2734, "time_per_iteration": 2.710293769836426 }, { "auxiliary_loss_clip": 0.01135758, "auxiliary_loss_mlp": 0.01056722, "balance_loss_clip": 1.05488348, "balance_loss_mlp": 1.03470409, "epoch": 0.16443709604689613, "flos": 34532239351680.0, "grad_norm": 2.100664117201308, "language_loss": 0.81810421, "learning_rate": 3.8133874921856914e-06, "loss": 0.840029, "num_input_tokens_seen": 59154095, "step": 2735, "time_per_iteration": 2.8074140548706055 }, { "auxiliary_loss_clip": 0.01070875, "auxiliary_loss_mlp": 0.01044313, "balance_loss_clip": 1.04323888, "balance_loss_mlp": 1.02508426, "epoch": 0.1644972192995641, "flos": 23258048895360.0, "grad_norm": 2.405088987017839, "language_loss": 0.78515649, "learning_rate": 3.813223186925296e-06, "loss": 0.80630839, "num_input_tokens_seen": 59173795, "step": 2736, "time_per_iteration": 2.839087963104248 }, { "auxiliary_loss_clip": 0.01147998, "auxiliary_loss_mlp": 0.01054659, "balance_loss_clip": 1.05859447, "balance_loss_mlp": 1.03513288, "epoch": 0.1645573425522321, "flos": 26979543342720.0, "grad_norm": 1.9462182296456145, "language_loss": 0.81052899, "learning_rate": 3.8130588129072964e-06, "loss": 0.83255553, "num_input_tokens_seen": 59191610, "step": 2737, "time_per_iteration": 2.7328996658325195 }, { "auxiliary_loss_clip": 0.01150424, "auxiliary_loss_mlp": 0.01052207, "balance_loss_clip": 1.0559026, "balance_loss_mlp": 1.03065443, "epoch": 0.16461746580490005, "flos": 28731768600960.0, "grad_norm": 1.8596348168124566, "language_loss": 0.87449318, "learning_rate": 3.8128943701379246e-06, "loss": 0.89651948, "num_input_tokens_seen": 59213000, "step": 2738, "time_per_iteration": 2.7345526218414307 }, { "auxiliary_loss_clip": 0.01139154, "auxiliary_loss_mlp": 0.0106055, "balance_loss_clip": 1.05534518, "balance_loss_mlp": 1.04079759, "epoch": 0.16467758905756802, "flos": 24930156867840.0, "grad_norm": 1.728421510231393, "language_loss": 0.71997833, "learning_rate": 3.8127298586234167e-06, "loss": 0.74197543, "num_input_tokens_seen": 59232340, "step": 2739, "time_per_iteration": 2.7091422080993652 }, { "auxiliary_loss_clip": 0.01154419, "auxiliary_loss_mlp": 0.0105106, "balance_loss_clip": 1.05673754, "balance_loss_mlp": 1.0312835, "epoch": 0.16473771231023598, "flos": 24826519152000.0, "grad_norm": 1.8559436932352185, "language_loss": 0.81645715, "learning_rate": 3.8125652783700104e-06, "loss": 0.83851194, "num_input_tokens_seen": 59253950, "step": 2740, "time_per_iteration": 2.712658166885376 }, { "auxiliary_loss_clip": 0.01114061, "auxiliary_loss_mlp": 0.01068725, "balance_loss_clip": 1.04991829, "balance_loss_mlp": 1.04307163, "epoch": 0.16479783556290395, "flos": 39896072375040.0, "grad_norm": 2.0528021789830837, "language_loss": 0.69467485, "learning_rate": 3.8124006293839475e-06, "loss": 0.71650267, "num_input_tokens_seen": 59275545, "step": 2741, "time_per_iteration": 2.8629493713378906 }, { "auxiliary_loss_clip": 0.01167543, "auxiliary_loss_mlp": 0.01048721, "balance_loss_clip": 1.05907226, "balance_loss_mlp": 1.02906334, "epoch": 0.16485795881557191, "flos": 19897061299200.0, "grad_norm": 1.7765193730452222, "language_loss": 0.79811072, "learning_rate": 3.812235911671472e-06, "loss": 0.8202734, "num_input_tokens_seen": 59293480, "step": 2742, "time_per_iteration": 2.626775026321411 }, { "auxiliary_loss_clip": 0.01141681, "auxiliary_loss_mlp": 0.01055663, "balance_loss_clip": 1.05664062, "balance_loss_mlp": 1.03477716, "epoch": 0.16491808206823988, "flos": 20556129997440.0, "grad_norm": 1.91797408289014, "language_loss": 0.8499459, "learning_rate": 3.8120711252388274e-06, "loss": 0.87191939, "num_input_tokens_seen": 59313435, "step": 2743, "time_per_iteration": 2.8218302726745605 }, { "auxiliary_loss_clip": 0.01162447, "auxiliary_loss_mlp": 0.01051969, "balance_loss_clip": 1.05743837, "balance_loss_mlp": 1.03196514, "epoch": 0.16497820532090787, "flos": 23800802376960.0, "grad_norm": 1.4425200129075006, "language_loss": 0.85558498, "learning_rate": 3.811906270092265e-06, "loss": 0.87772918, "num_input_tokens_seen": 59331535, "step": 2744, "time_per_iteration": 4.206263542175293 }, { "auxiliary_loss_clip": 0.01131671, "auxiliary_loss_mlp": 0.0104676, "balance_loss_clip": 1.05206287, "balance_loss_mlp": 1.02812767, "epoch": 0.16503832857357584, "flos": 25482642935040.0, "grad_norm": 1.6285200980820358, "language_loss": 0.82770813, "learning_rate": 3.811741346238036e-06, "loss": 0.84949243, "num_input_tokens_seen": 59350680, "step": 2745, "time_per_iteration": 4.331594467163086 }, { "auxiliary_loss_clip": 0.011344, "auxiliary_loss_mlp": 0.01057242, "balance_loss_clip": 1.05874014, "balance_loss_mlp": 1.03825223, "epoch": 0.1650984518262438, "flos": 17676058619520.0, "grad_norm": 6.766690288332402, "language_loss": 0.76811314, "learning_rate": 3.8115763536823923e-06, "loss": 0.79002959, "num_input_tokens_seen": 59367020, "step": 2746, "time_per_iteration": 4.225586414337158 }, { "auxiliary_loss_clip": 0.01164296, "auxiliary_loss_mlp": 0.01055636, "balance_loss_clip": 1.05781221, "balance_loss_mlp": 1.03533494, "epoch": 0.16515857507891177, "flos": 18698327688960.0, "grad_norm": 1.9760186874049024, "language_loss": 0.80818808, "learning_rate": 3.811411292431592e-06, "loss": 0.83038735, "num_input_tokens_seen": 59386075, "step": 2747, "time_per_iteration": 2.6862480640411377 }, { "auxiliary_loss_clip": 0.01157975, "auxiliary_loss_mlp": 0.0104673, "balance_loss_clip": 1.05990267, "balance_loss_mlp": 1.02664328, "epoch": 0.16521869833157973, "flos": 15010481306880.0, "grad_norm": 2.0608482379031337, "language_loss": 0.69433749, "learning_rate": 3.8112461624918945e-06, "loss": 0.71638453, "num_input_tokens_seen": 59402690, "step": 2748, "time_per_iteration": 2.6520986557006836 }, { "auxiliary_loss_clip": 0.01169692, "auxiliary_loss_mlp": 0.00778195, "balance_loss_clip": 1.06237423, "balance_loss_mlp": 1.00173104, "epoch": 0.1652788215842477, "flos": 22121152548480.0, "grad_norm": 2.259215537482641, "language_loss": 0.88012803, "learning_rate": 3.811080963869561e-06, "loss": 0.89960694, "num_input_tokens_seen": 59421130, "step": 2749, "time_per_iteration": 4.260679244995117 }, { "auxiliary_loss_clip": 0.01154179, "auxiliary_loss_mlp": 0.01045617, "balance_loss_clip": 1.05586052, "balance_loss_mlp": 1.02542281, "epoch": 0.16533894483691566, "flos": 18333080242560.0, "grad_norm": 2.0880864906339864, "language_loss": 0.79240286, "learning_rate": 3.8109156965708557e-06, "loss": 0.81440079, "num_input_tokens_seen": 59438970, "step": 2750, "time_per_iteration": 2.6335251331329346 }, { "auxiliary_loss_clip": 0.01153343, "auxiliary_loss_mlp": 0.0104591, "balance_loss_clip": 1.0579437, "balance_loss_mlp": 1.02602625, "epoch": 0.16539906808958366, "flos": 22382115834240.0, "grad_norm": 1.6952801391084946, "language_loss": 0.94854712, "learning_rate": 3.8107503606020455e-06, "loss": 0.97053963, "num_input_tokens_seen": 59458510, "step": 2751, "time_per_iteration": 2.697174310684204 }, { "auxiliary_loss_clip": 0.0106803, "auxiliary_loss_mlp": 0.0105236, "balance_loss_clip": 1.04625726, "balance_loss_mlp": 1.03247619, "epoch": 0.16545919134225162, "flos": 22711093522560.0, "grad_norm": 2.614588592950962, "language_loss": 0.71231711, "learning_rate": 3.8105849559693997e-06, "loss": 0.73352098, "num_input_tokens_seen": 59477110, "step": 2752, "time_per_iteration": 2.7780745029449463 }, { "auxiliary_loss_clip": 0.01090521, "auxiliary_loss_mlp": 0.01022104, "balance_loss_clip": 1.05741131, "balance_loss_mlp": 1.01941013, "epoch": 0.1655193145949196, "flos": 67802974076160.0, "grad_norm": 0.7721529651221379, "language_loss": 0.54058975, "learning_rate": 3.810419482679192e-06, "loss": 0.56171602, "num_input_tokens_seen": 59541155, "step": 2753, "time_per_iteration": 3.3371469974517822 }, { "auxiliary_loss_clip": 0.01163808, "auxiliary_loss_mlp": 0.00778536, "balance_loss_clip": 1.05587018, "balance_loss_mlp": 1.00172091, "epoch": 0.16557943784758755, "flos": 24280389792000.0, "grad_norm": 1.6411537728312637, "language_loss": 0.75436741, "learning_rate": 3.8102539407376954e-06, "loss": 0.7737909, "num_input_tokens_seen": 59561155, "step": 2754, "time_per_iteration": 2.6382133960723877 }, { "auxiliary_loss_clip": 0.01139421, "auxiliary_loss_mlp": 0.01060584, "balance_loss_clip": 1.05406713, "balance_loss_mlp": 1.03768396, "epoch": 0.16563956110025552, "flos": 20083617561600.0, "grad_norm": 2.4067479946694137, "language_loss": 0.86654639, "learning_rate": 3.810088330151188e-06, "loss": 0.88854647, "num_input_tokens_seen": 59580460, "step": 2755, "time_per_iteration": 2.6590075492858887 }, { "auxiliary_loss_clip": 0.01122817, "auxiliary_loss_mlp": 0.01053169, "balance_loss_clip": 1.04948378, "balance_loss_mlp": 1.03293943, "epoch": 0.16569968435292348, "flos": 28034454896640.0, "grad_norm": 1.7268487777137649, "language_loss": 0.73350251, "learning_rate": 3.80992265092595e-06, "loss": 0.75526237, "num_input_tokens_seen": 59600025, "step": 2756, "time_per_iteration": 2.771820545196533 }, { "auxiliary_loss_clip": 0.01128662, "auxiliary_loss_mlp": 0.01049666, "balance_loss_clip": 1.05550277, "balance_loss_mlp": 1.02969813, "epoch": 0.16575980760559147, "flos": 26250233598720.0, "grad_norm": 1.5540667033085804, "language_loss": 0.75308084, "learning_rate": 3.8097569030682636e-06, "loss": 0.77486414, "num_input_tokens_seen": 59620600, "step": 2757, "time_per_iteration": 2.8106157779693604 }, { "auxiliary_loss_clip": 0.01143608, "auxiliary_loss_mlp": 0.01054064, "balance_loss_clip": 1.057634, "balance_loss_mlp": 1.03390563, "epoch": 0.16581993085825944, "flos": 26943955943040.0, "grad_norm": 1.8675154897424497, "language_loss": 0.84604371, "learning_rate": 3.8095910865844137e-06, "loss": 0.86802036, "num_input_tokens_seen": 59641385, "step": 2758, "time_per_iteration": 2.8663368225097656 }, { "auxiliary_loss_clip": 0.01168186, "auxiliary_loss_mlp": 0.01058337, "balance_loss_clip": 1.06166434, "balance_loss_mlp": 1.03952527, "epoch": 0.1658800541109274, "flos": 21653632103040.0, "grad_norm": 2.0824774555850243, "language_loss": 0.78848934, "learning_rate": 3.809425201480689e-06, "loss": 0.81075454, "num_input_tokens_seen": 59659865, "step": 2759, "time_per_iteration": 2.655371904373169 }, { "auxiliary_loss_clip": 0.01098973, "auxiliary_loss_mlp": 0.0104879, "balance_loss_clip": 1.0491066, "balance_loss_mlp": 1.02846527, "epoch": 0.16594017736359537, "flos": 16435488643200.0, "grad_norm": 2.4005603702739613, "language_loss": 0.75130272, "learning_rate": 3.8092592477633793e-06, "loss": 0.77278036, "num_input_tokens_seen": 59678780, "step": 2760, "time_per_iteration": 2.767866611480713 }, { "auxiliary_loss_clip": 0.01117278, "auxiliary_loss_mlp": 0.0104823, "balance_loss_clip": 1.05129814, "balance_loss_mlp": 1.02867997, "epoch": 0.16600030061626334, "flos": 22637297030400.0, "grad_norm": 1.5792623632565632, "language_loss": 0.73425764, "learning_rate": 3.8090932254387774e-06, "loss": 0.75591272, "num_input_tokens_seen": 59698795, "step": 2761, "time_per_iteration": 2.762836456298828 }, { "auxiliary_loss_clip": 0.0113507, "auxiliary_loss_mlp": 0.01050415, "balance_loss_clip": 1.05250192, "balance_loss_mlp": 1.03018475, "epoch": 0.1660604238689313, "flos": 26396569607040.0, "grad_norm": 2.9515424803015033, "language_loss": 0.88832974, "learning_rate": 3.8089271345131788e-06, "loss": 0.91018462, "num_input_tokens_seen": 59718795, "step": 2762, "time_per_iteration": 2.766324281692505 }, { "auxiliary_loss_clip": 0.01115163, "auxiliary_loss_mlp": 0.01050144, "balance_loss_clip": 1.05208707, "balance_loss_mlp": 1.03080845, "epoch": 0.16612054712159927, "flos": 23039999383680.0, "grad_norm": 1.84507980271118, "language_loss": 0.87992418, "learning_rate": 3.8087609749928822e-06, "loss": 0.90157735, "num_input_tokens_seen": 59737555, "step": 2763, "time_per_iteration": 2.7734055519104004 }, { "auxiliary_loss_clip": 0.01086152, "auxiliary_loss_mlp": 0.01013622, "balance_loss_clip": 1.0448606, "balance_loss_mlp": 1.01065338, "epoch": 0.16618067037426726, "flos": 59241225202560.0, "grad_norm": 0.7790832079967882, "language_loss": 0.59799927, "learning_rate": 3.8085947468841885e-06, "loss": 0.61899698, "num_input_tokens_seen": 59800915, "step": 2764, "time_per_iteration": 3.1728692054748535 }, { "auxiliary_loss_clip": 0.01152232, "auxiliary_loss_mlp": 0.01053607, "balance_loss_clip": 1.05467176, "balance_loss_mlp": 1.03254318, "epoch": 0.16624079362693522, "flos": 27198813916800.0, "grad_norm": 1.7436496772383425, "language_loss": 0.82260036, "learning_rate": 3.808428450193401e-06, "loss": 0.84465873, "num_input_tokens_seen": 59822910, "step": 2765, "time_per_iteration": 2.72440767288208 }, { "auxiliary_loss_clip": 0.01171844, "auxiliary_loss_mlp": 0.01049085, "balance_loss_clip": 1.05882454, "balance_loss_mlp": 1.02746069, "epoch": 0.1663009168796032, "flos": 10925068216320.0, "grad_norm": 2.128015994498251, "language_loss": 0.69980019, "learning_rate": 3.8082620849268244e-06, "loss": 0.72200948, "num_input_tokens_seen": 59838805, "step": 2766, "time_per_iteration": 2.5810647010803223 }, { "auxiliary_loss_clip": 0.0115036, "auxiliary_loss_mlp": 0.01047665, "balance_loss_clip": 1.05772817, "balance_loss_mlp": 1.02792454, "epoch": 0.16636104013227115, "flos": 17894431353600.0, "grad_norm": 2.107381123394178, "language_loss": 0.8845337, "learning_rate": 3.808095651090769e-06, "loss": 0.90651393, "num_input_tokens_seen": 59855345, "step": 2767, "time_per_iteration": 2.659240245819092 }, { "auxiliary_loss_clip": 0.01077283, "auxiliary_loss_mlp": 0.01002999, "balance_loss_clip": 1.046556, "balance_loss_mlp": 1.00020981, "epoch": 0.16642116338493912, "flos": 66726050463360.0, "grad_norm": 0.6403612433239105, "language_loss": 0.5289067, "learning_rate": 3.8079291486915447e-06, "loss": 0.54970956, "num_input_tokens_seen": 59917710, "step": 2768, "time_per_iteration": 3.28488826751709 }, { "auxiliary_loss_clip": 0.01137637, "auxiliary_loss_mlp": 0.01051692, "balance_loss_clip": 1.05451822, "balance_loss_mlp": 1.03034163, "epoch": 0.16648128663760708, "flos": 19026048401280.0, "grad_norm": 2.4342686570828267, "language_loss": 0.84962058, "learning_rate": 3.8077625777354667e-06, "loss": 0.87151396, "num_input_tokens_seen": 59935105, "step": 2769, "time_per_iteration": 2.753257989883423 }, { "auxiliary_loss_clip": 0.01068987, "auxiliary_loss_mlp": 0.0100573, "balance_loss_clip": 1.04678345, "balance_loss_mlp": 1.00316668, "epoch": 0.16654140989027508, "flos": 70134976759680.0, "grad_norm": 0.8107434108728753, "language_loss": 0.57455683, "learning_rate": 3.80759593822885e-06, "loss": 0.59530401, "num_input_tokens_seen": 59984085, "step": 2770, "time_per_iteration": 3.2202906608581543 }, { "auxiliary_loss_clip": 0.01054548, "auxiliary_loss_mlp": 0.01003676, "balance_loss_clip": 1.04637623, "balance_loss_mlp": 1.00086308, "epoch": 0.16660153314294304, "flos": 70272406195200.0, "grad_norm": 0.8940719168038874, "language_loss": 0.56241393, "learning_rate": 3.807429230178015e-06, "loss": 0.58299619, "num_input_tokens_seen": 60043470, "step": 2771, "time_per_iteration": 3.3302085399627686 }, { "auxiliary_loss_clip": 0.01110714, "auxiliary_loss_mlp": 0.01053994, "balance_loss_clip": 1.04819679, "balance_loss_mlp": 1.03316772, "epoch": 0.166661656395611, "flos": 23075048079360.0, "grad_norm": 2.9137693497887778, "language_loss": 0.70419657, "learning_rate": 3.8072624535892817e-06, "loss": 0.72584367, "num_input_tokens_seen": 60063045, "step": 2772, "time_per_iteration": 2.845414161682129 }, { "auxiliary_loss_clip": 0.0114592, "auxiliary_loss_mlp": 0.01049708, "balance_loss_clip": 1.05082583, "balance_loss_mlp": 1.02923954, "epoch": 0.16672177964827897, "flos": 28366341586560.0, "grad_norm": 2.20945076195277, "language_loss": 0.86324167, "learning_rate": 3.807095608468975e-06, "loss": 0.88519788, "num_input_tokens_seen": 60081945, "step": 2773, "time_per_iteration": 2.669412851333618 }, { "auxiliary_loss_clip": 0.01095425, "auxiliary_loss_mlp": 0.01049097, "balance_loss_clip": 1.04436934, "balance_loss_mlp": 1.0300827, "epoch": 0.16678190290094694, "flos": 19091010147840.0, "grad_norm": 2.0211952616678937, "language_loss": 0.82141376, "learning_rate": 3.8069286948234224e-06, "loss": 0.84285897, "num_input_tokens_seen": 60096820, "step": 2774, "time_per_iteration": 2.7111308574676514 }, { "auxiliary_loss_clip": 0.01123493, "auxiliary_loss_mlp": 0.01045144, "balance_loss_clip": 1.05252421, "balance_loss_mlp": 1.02446127, "epoch": 0.1668420261536149, "flos": 21799106184960.0, "grad_norm": 3.3781068524499, "language_loss": 0.8298822, "learning_rate": 3.806761712658952e-06, "loss": 0.85156858, "num_input_tokens_seen": 60116140, "step": 2775, "time_per_iteration": 2.7367632389068604 }, { "auxiliary_loss_clip": 0.01150495, "auxiliary_loss_mlp": 0.01051475, "balance_loss_clip": 1.05761933, "balance_loss_mlp": 1.03264022, "epoch": 0.16690214940628287, "flos": 19062533640960.0, "grad_norm": 1.8115651629444076, "language_loss": 0.80919641, "learning_rate": 3.806594661981897e-06, "loss": 0.8312161, "num_input_tokens_seen": 60134235, "step": 2776, "time_per_iteration": 2.651723623275757 }, { "auxiliary_loss_clip": 0.0113775, "auxiliary_loss_mlp": 0.01054199, "balance_loss_clip": 1.05518723, "balance_loss_mlp": 1.0346483, "epoch": 0.16696227265895086, "flos": 18588548747520.0, "grad_norm": 2.7510345221850336, "language_loss": 0.80203485, "learning_rate": 3.8064275427985906e-06, "loss": 0.82395434, "num_input_tokens_seen": 60153275, "step": 2777, "time_per_iteration": 2.6380929946899414 }, { "auxiliary_loss_clip": 0.01147967, "auxiliary_loss_mlp": 0.01045166, "balance_loss_clip": 1.05270481, "balance_loss_mlp": 1.02640271, "epoch": 0.16702239591161883, "flos": 23294139085440.0, "grad_norm": 1.6179722336290305, "language_loss": 0.85384095, "learning_rate": 3.806260355115371e-06, "loss": 0.87577224, "num_input_tokens_seen": 60173215, "step": 2778, "time_per_iteration": 2.754652500152588 }, { "auxiliary_loss_clip": 0.01136802, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.0531714, "balance_loss_mlp": 1.02148652, "epoch": 0.1670825191642868, "flos": 24425648392320.0, "grad_norm": 3.2091470007324414, "language_loss": 0.74180603, "learning_rate": 3.8060930989385778e-06, "loss": 0.76358056, "num_input_tokens_seen": 60190515, "step": 2779, "time_per_iteration": 2.777193784713745 }, { "auxiliary_loss_clip": 0.01112683, "auxiliary_loss_mlp": 0.00777451, "balance_loss_clip": 1.04981184, "balance_loss_mlp": 1.0015173, "epoch": 0.16714264241695476, "flos": 26797512193920.0, "grad_norm": 2.127789274190337, "language_loss": 0.6557346, "learning_rate": 3.805925774274554e-06, "loss": 0.67463589, "num_input_tokens_seen": 60211655, "step": 2780, "time_per_iteration": 2.896976947784424 }, { "auxiliary_loss_clip": 0.01120921, "auxiliary_loss_mlp": 0.01045506, "balance_loss_clip": 1.04843462, "balance_loss_mlp": 1.02547836, "epoch": 0.16720276566962272, "flos": 21835304115840.0, "grad_norm": 2.46647860258999, "language_loss": 0.78422606, "learning_rate": 3.805758381129643e-06, "loss": 0.80589032, "num_input_tokens_seen": 60230860, "step": 2781, "time_per_iteration": 2.725782632827759 }, { "auxiliary_loss_clip": 0.01094692, "auxiliary_loss_mlp": 0.01050104, "balance_loss_clip": 1.04439843, "balance_loss_mlp": 1.03056526, "epoch": 0.1672628889222907, "flos": 21470415805440.0, "grad_norm": 26.23767952829368, "language_loss": 0.75119764, "learning_rate": 3.805590919510193e-06, "loss": 0.77264553, "num_input_tokens_seen": 60250535, "step": 2782, "time_per_iteration": 2.7064197063446045 }, { "auxiliary_loss_clip": 0.01129162, "auxiliary_loss_mlp": 0.01047612, "balance_loss_clip": 1.05152631, "balance_loss_mlp": 1.02764392, "epoch": 0.16732301217495865, "flos": 30774008269440.0, "grad_norm": 2.116531296279042, "language_loss": 0.67398441, "learning_rate": 3.8054233894225547e-06, "loss": 0.69575214, "num_input_tokens_seen": 60269530, "step": 2783, "time_per_iteration": 2.7901556491851807 }, { "auxiliary_loss_clip": 0.01158882, "auxiliary_loss_mlp": 0.0105166, "balance_loss_clip": 1.05460215, "balance_loss_mlp": 1.03271747, "epoch": 0.16738313542762664, "flos": 23474625949440.0, "grad_norm": 1.7768362036873409, "language_loss": 0.69919086, "learning_rate": 3.805255790873081e-06, "loss": 0.72129631, "num_input_tokens_seen": 60289900, "step": 2784, "time_per_iteration": 5.714844226837158 }, { "auxiliary_loss_clip": 0.01137618, "auxiliary_loss_mlp": 0.01056022, "balance_loss_clip": 1.05217624, "balance_loss_mlp": 1.03539932, "epoch": 0.1674432586802946, "flos": 29789086366080.0, "grad_norm": 4.741795209709136, "language_loss": 0.60970068, "learning_rate": 3.805088123868126e-06, "loss": 0.6316371, "num_input_tokens_seen": 60310025, "step": 2785, "time_per_iteration": 4.219547510147095 }, { "auxiliary_loss_clip": 0.01057886, "auxiliary_loss_mlp": 0.0100398, "balance_loss_clip": 1.03758883, "balance_loss_mlp": 1.00141752, "epoch": 0.16750338193296258, "flos": 66136073575680.0, "grad_norm": 0.773077721474628, "language_loss": 0.58780885, "learning_rate": 3.8049203884140492e-06, "loss": 0.60842752, "num_input_tokens_seen": 60377800, "step": 2786, "time_per_iteration": 3.2306320667266846 }, { "auxiliary_loss_clip": 0.0113927, "auxiliary_loss_mlp": 0.01044966, "balance_loss_clip": 1.0496738, "balance_loss_mlp": 1.02589226, "epoch": 0.16756350518563054, "flos": 25696777864320.0, "grad_norm": 1.7333132735183339, "language_loss": 0.76308596, "learning_rate": 3.80475258451721e-06, "loss": 0.78492826, "num_input_tokens_seen": 60398215, "step": 2787, "time_per_iteration": 2.6434125900268555 }, { "auxiliary_loss_clip": 0.01146924, "auxiliary_loss_mlp": 0.01043386, "balance_loss_clip": 1.0529089, "balance_loss_mlp": 1.02544546, "epoch": 0.1676236284382985, "flos": 23836102467840.0, "grad_norm": 1.7210472408736244, "language_loss": 0.7717936, "learning_rate": 3.804584712183972e-06, "loss": 0.79369676, "num_input_tokens_seen": 60416910, "step": 2788, "time_per_iteration": 4.359618425369263 }, { "auxiliary_loss_clip": 0.01054629, "auxiliary_loss_mlp": 0.00999991, "balance_loss_clip": 1.03482509, "balance_loss_mlp": 0.99746382, "epoch": 0.16768375169096647, "flos": 59874902985600.0, "grad_norm": 0.8596744797543817, "language_loss": 0.59331679, "learning_rate": 3.8044167714207013e-06, "loss": 0.61386299, "num_input_tokens_seen": 60468660, "step": 2789, "time_per_iteration": 3.0742650032043457 }, { "auxiliary_loss_clip": 0.01148272, "auxiliary_loss_mlp": 0.01053856, "balance_loss_clip": 1.05450928, "balance_loss_mlp": 1.03428209, "epoch": 0.16774387494363446, "flos": 38435657207040.0, "grad_norm": 1.689036486923415, "language_loss": 0.7012763, "learning_rate": 3.804248762233765e-06, "loss": 0.7232976, "num_input_tokens_seen": 60492370, "step": 2790, "time_per_iteration": 2.872232437133789 }, { "auxiliary_loss_clip": 0.0112492, "auxiliary_loss_mlp": 0.01051622, "balance_loss_clip": 1.0497216, "balance_loss_mlp": 1.0334661, "epoch": 0.16780399819630243, "flos": 22637620252800.0, "grad_norm": 1.864386369112868, "language_loss": 0.79464513, "learning_rate": 3.8040806846295356e-06, "loss": 0.81641054, "num_input_tokens_seen": 60512655, "step": 2791, "time_per_iteration": 2.7180140018463135 }, { "auxiliary_loss_clip": 0.01122456, "auxiliary_loss_mlp": 0.01050939, "balance_loss_clip": 1.04977369, "balance_loss_mlp": 1.03106701, "epoch": 0.1678641214489704, "flos": 32891516887680.0, "grad_norm": 1.705849915566178, "language_loss": 0.71547955, "learning_rate": 3.8039125386143853e-06, "loss": 0.73721349, "num_input_tokens_seen": 60533090, "step": 2792, "time_per_iteration": 2.9221818447113037 }, { "auxiliary_loss_clip": 0.01131469, "auxiliary_loss_mlp": 0.01044061, "balance_loss_clip": 1.05479562, "balance_loss_mlp": 1.02551246, "epoch": 0.16792424470163836, "flos": 19974916028160.0, "grad_norm": 1.9301593564774673, "language_loss": 0.71581644, "learning_rate": 3.803744324194691e-06, "loss": 0.73757172, "num_input_tokens_seen": 60553190, "step": 2793, "time_per_iteration": 2.75104022026062 }, { "auxiliary_loss_clip": 0.01143072, "auxiliary_loss_mlp": 0.01053231, "balance_loss_clip": 1.05276942, "balance_loss_mlp": 1.03452659, "epoch": 0.16798436795430632, "flos": 19719878486400.0, "grad_norm": 2.3859650274226833, "language_loss": 0.7717455, "learning_rate": 3.803576041376831e-06, "loss": 0.79370856, "num_input_tokens_seen": 60571995, "step": 2794, "time_per_iteration": 2.6007745265960693 }, { "auxiliary_loss_clip": 0.01137828, "auxiliary_loss_mlp": 0.0104987, "balance_loss_clip": 1.05250025, "balance_loss_mlp": 1.03010476, "epoch": 0.1680444912069743, "flos": 28104839596800.0, "grad_norm": 2.7692472240964747, "language_loss": 0.71609265, "learning_rate": 3.803407690167187e-06, "loss": 0.73796958, "num_input_tokens_seen": 60591275, "step": 2795, "time_per_iteration": 2.693826198577881 }, { "auxiliary_loss_clip": 0.01131865, "auxiliary_loss_mlp": 0.01041012, "balance_loss_clip": 1.04973865, "balance_loss_mlp": 1.02302384, "epoch": 0.16810461445964225, "flos": 18075205526400.0, "grad_norm": 1.990096863808903, "language_loss": 0.84230494, "learning_rate": 3.803239270572142e-06, "loss": 0.8640337, "num_input_tokens_seen": 60609235, "step": 2796, "time_per_iteration": 2.697253465652466 }, { "auxiliary_loss_clip": 0.01101634, "auxiliary_loss_mlp": 0.01045196, "balance_loss_clip": 1.04877055, "balance_loss_mlp": 1.0262773, "epoch": 0.16816473771231025, "flos": 23878657105920.0, "grad_norm": 1.9272276676322646, "language_loss": 0.81609607, "learning_rate": 3.8030707825980838e-06, "loss": 0.83756441, "num_input_tokens_seen": 60629880, "step": 2797, "time_per_iteration": 2.8784244060516357 }, { "auxiliary_loss_clip": 0.0114057, "auxiliary_loss_mlp": 0.01041282, "balance_loss_clip": 1.05136061, "balance_loss_mlp": 1.02448523, "epoch": 0.1682248609649782, "flos": 22783597125120.0, "grad_norm": 1.7015769336052518, "language_loss": 0.74811113, "learning_rate": 3.802902226251401e-06, "loss": 0.76992965, "num_input_tokens_seen": 60651175, "step": 2798, "time_per_iteration": 2.700727939605713 }, { "auxiliary_loss_clip": 0.01161342, "auxiliary_loss_mlp": 0.01048462, "balance_loss_clip": 1.05728281, "balance_loss_mlp": 1.03075945, "epoch": 0.16828498421764618, "flos": 20705123612160.0, "grad_norm": 1.5964091182578661, "language_loss": 0.79693568, "learning_rate": 3.8027336015384845e-06, "loss": 0.81903368, "num_input_tokens_seen": 60670210, "step": 2799, "time_per_iteration": 2.6582021713256836 }, { "auxiliary_loss_clip": 0.01077177, "auxiliary_loss_mlp": 0.01045216, "balance_loss_clip": 1.04514158, "balance_loss_mlp": 1.02374637, "epoch": 0.16834510747031414, "flos": 29420606695680.0, "grad_norm": 4.227726163531211, "language_loss": 0.70963746, "learning_rate": 3.8025649084657296e-06, "loss": 0.73086143, "num_input_tokens_seen": 60690895, "step": 2800, "time_per_iteration": 2.8856699466705322 }, { "auxiliary_loss_clip": 0.01108822, "auxiliary_loss_mlp": 0.00777078, "balance_loss_clip": 1.04776788, "balance_loss_mlp": 1.00161195, "epoch": 0.1684052307229821, "flos": 18145374744960.0, "grad_norm": 1.9902029671619985, "language_loss": 0.83663505, "learning_rate": 3.8023961470395326e-06, "loss": 0.85549408, "num_input_tokens_seen": 60708280, "step": 2801, "time_per_iteration": 2.6917035579681396 }, { "auxiliary_loss_clip": 0.01128148, "auxiliary_loss_mlp": 0.01049324, "balance_loss_clip": 1.05011535, "balance_loss_mlp": 1.03084683, "epoch": 0.16846535397565007, "flos": 16574929240320.0, "grad_norm": 2.4052305427948735, "language_loss": 0.82509923, "learning_rate": 3.8022273172662933e-06, "loss": 0.84687394, "num_input_tokens_seen": 60724150, "step": 2802, "time_per_iteration": 2.882611036300659 }, { "auxiliary_loss_clip": 0.01150156, "auxiliary_loss_mlp": 0.01048717, "balance_loss_clip": 1.05517435, "balance_loss_mlp": 1.02885723, "epoch": 0.16852547722831807, "flos": 30408868563840.0, "grad_norm": 3.107584498439891, "language_loss": 0.80643189, "learning_rate": 3.802058419152413e-06, "loss": 0.8284207, "num_input_tokens_seen": 60746485, "step": 2803, "time_per_iteration": 2.7886922359466553 }, { "auxiliary_loss_clip": 0.01148107, "auxiliary_loss_mlp": 0.01047852, "balance_loss_clip": 1.0556829, "balance_loss_mlp": 1.02918339, "epoch": 0.16858560048098603, "flos": 33507420416640.0, "grad_norm": 2.2127389669880713, "language_loss": 0.76168799, "learning_rate": 3.801889452704297e-06, "loss": 0.7836476, "num_input_tokens_seen": 60762875, "step": 2804, "time_per_iteration": 2.7588601112365723 }, { "auxiliary_loss_clip": 0.01045171, "auxiliary_loss_mlp": 0.01013955, "balance_loss_clip": 1.03581083, "balance_loss_mlp": 1.01078367, "epoch": 0.168645723733654, "flos": 67370502326400.0, "grad_norm": 0.8536034833258724, "language_loss": 0.55464876, "learning_rate": 3.8017204179283526e-06, "loss": 0.57524002, "num_input_tokens_seen": 60825510, "step": 2805, "time_per_iteration": 3.2089412212371826 }, { "auxiliary_loss_clip": 0.01138275, "auxiliary_loss_mlp": 0.0103974, "balance_loss_clip": 1.05013156, "balance_loss_mlp": 1.02239537, "epoch": 0.16870584698632196, "flos": 21324618501120.0, "grad_norm": 2.2836767274778427, "language_loss": 0.73090243, "learning_rate": 3.8015513148309892e-06, "loss": 0.75268269, "num_input_tokens_seen": 60844440, "step": 2806, "time_per_iteration": 2.643596649169922 }, { "auxiliary_loss_clip": 0.01117063, "auxiliary_loss_mlp": 0.01045402, "balance_loss_clip": 1.05330753, "balance_loss_mlp": 1.02766335, "epoch": 0.16876597023898993, "flos": 20740746925440.0, "grad_norm": 1.8406859431587912, "language_loss": 0.69773197, "learning_rate": 3.80138214341862e-06, "loss": 0.71935666, "num_input_tokens_seen": 60863210, "step": 2807, "time_per_iteration": 2.6946568489074707 }, { "auxiliary_loss_clip": 0.01130702, "auxiliary_loss_mlp": 0.01047199, "balance_loss_clip": 1.04842246, "balance_loss_mlp": 1.02794707, "epoch": 0.1688260934916579, "flos": 20303498666880.0, "grad_norm": 3.042021842274248, "language_loss": 0.70280695, "learning_rate": 3.8012129036976587e-06, "loss": 0.72458601, "num_input_tokens_seen": 60882510, "step": 2808, "time_per_iteration": 2.6656088829040527 }, { "auxiliary_loss_clip": 0.01119025, "auxiliary_loss_mlp": 0.01041739, "balance_loss_clip": 1.05019665, "balance_loss_mlp": 1.02164018, "epoch": 0.16888621674432586, "flos": 20340702178560.0, "grad_norm": 2.0835789337145965, "language_loss": 0.79903001, "learning_rate": 3.8010435956745236e-06, "loss": 0.8206377, "num_input_tokens_seen": 60901105, "step": 2809, "time_per_iteration": 2.7665679454803467 }, { "auxiliary_loss_clip": 0.01155146, "auxiliary_loss_mlp": 0.01042018, "balance_loss_clip": 1.0557605, "balance_loss_mlp": 1.02252758, "epoch": 0.16894633999699385, "flos": 16244802316800.0, "grad_norm": 2.0672093223845245, "language_loss": 0.88076419, "learning_rate": 3.8008742193556358e-06, "loss": 0.90273583, "num_input_tokens_seen": 60915340, "step": 2810, "time_per_iteration": 2.6186363697052 }, { "auxiliary_loss_clip": 0.01149997, "auxiliary_loss_mlp": 0.0104631, "balance_loss_clip": 1.05503082, "balance_loss_mlp": 1.02715337, "epoch": 0.16900646324966181, "flos": 19610171372160.0, "grad_norm": 1.8921026809528976, "language_loss": 0.92376304, "learning_rate": 3.800704774747416e-06, "loss": 0.9457261, "num_input_tokens_seen": 60933735, "step": 2811, "time_per_iteration": 2.6567442417144775 }, { "auxiliary_loss_clip": 0.01140053, "auxiliary_loss_mlp": 0.01049063, "balance_loss_clip": 1.05383325, "balance_loss_mlp": 1.03039432, "epoch": 0.16906658650232978, "flos": 22018089450240.0, "grad_norm": 2.116573413654177, "language_loss": 0.78582352, "learning_rate": 3.800535261856291e-06, "loss": 0.8077147, "num_input_tokens_seen": 60953105, "step": 2812, "time_per_iteration": 2.6796023845672607 }, { "auxiliary_loss_clip": 0.01147895, "auxiliary_loss_mlp": 0.01043917, "balance_loss_clip": 1.05772316, "balance_loss_mlp": 1.02653646, "epoch": 0.16912670975499774, "flos": 11763690024960.0, "grad_norm": 2.5483899062625093, "language_loss": 0.75195068, "learning_rate": 3.8003656806886887e-06, "loss": 0.7738688, "num_input_tokens_seen": 60969150, "step": 2813, "time_per_iteration": 2.621772050857544 }, { "auxiliary_loss_clip": 0.01136313, "auxiliary_loss_mlp": 0.01045037, "balance_loss_clip": 1.05311871, "balance_loss_mlp": 1.02599943, "epoch": 0.1691868330076657, "flos": 17161386595200.0, "grad_norm": 3.0041182480764554, "language_loss": 0.69118392, "learning_rate": 3.8001960312510396e-06, "loss": 0.7129975, "num_input_tokens_seen": 60982825, "step": 2814, "time_per_iteration": 2.837264060974121 }, { "auxiliary_loss_clip": 0.01163835, "auxiliary_loss_mlp": 0.01039837, "balance_loss_clip": 1.05900145, "balance_loss_mlp": 1.02134776, "epoch": 0.16924695626033368, "flos": 22416553998720.0, "grad_norm": 3.1079956206415833, "language_loss": 0.61439502, "learning_rate": 3.800026313549776e-06, "loss": 0.63643175, "num_input_tokens_seen": 61000875, "step": 2815, "time_per_iteration": 2.6967194080352783 }, { "auxiliary_loss_clip": 0.01129827, "auxiliary_loss_mlp": 0.01042692, "balance_loss_clip": 1.05139673, "balance_loss_mlp": 1.02382088, "epoch": 0.16930707951300164, "flos": 25739655724800.0, "grad_norm": 1.7930623183302479, "language_loss": 0.82490849, "learning_rate": 3.7998565275913342e-06, "loss": 0.84663367, "num_input_tokens_seen": 61021940, "step": 2816, "time_per_iteration": 2.7227163314819336 }, { "auxiliary_loss_clip": 0.01133129, "auxiliary_loss_mlp": 0.01047914, "balance_loss_clip": 1.05375743, "balance_loss_mlp": 1.02853012, "epoch": 0.16936720276566963, "flos": 22747040058240.0, "grad_norm": 3.083808689594852, "language_loss": 0.87322289, "learning_rate": 3.799686673382153e-06, "loss": 0.89503324, "num_input_tokens_seen": 61040285, "step": 2817, "time_per_iteration": 2.733180522918701 }, { "auxiliary_loss_clip": 0.01141455, "auxiliary_loss_mlp": 0.01052753, "balance_loss_clip": 1.05800366, "balance_loss_mlp": 1.03352427, "epoch": 0.1694273260183376, "flos": 19573973441280.0, "grad_norm": 1.8594303503608436, "language_loss": 0.81247765, "learning_rate": 3.799516750928672e-06, "loss": 0.83441973, "num_input_tokens_seen": 61059020, "step": 2818, "time_per_iteration": 2.7384097576141357 }, { "auxiliary_loss_clip": 0.01160132, "auxiliary_loss_mlp": 0.01044196, "balance_loss_clip": 1.05699944, "balance_loss_mlp": 1.02496791, "epoch": 0.16948744927100556, "flos": 12457843332480.0, "grad_norm": 2.739998367204505, "language_loss": 0.80788404, "learning_rate": 3.799346760237336e-06, "loss": 0.82992733, "num_input_tokens_seen": 61074245, "step": 2819, "time_per_iteration": 2.609870672225952 }, { "auxiliary_loss_clip": 0.01069019, "auxiliary_loss_mlp": 0.01015301, "balance_loss_clip": 1.0485003, "balance_loss_mlp": 1.0125947, "epoch": 0.16954757252367353, "flos": 71291694435840.0, "grad_norm": 0.9309223426502673, "language_loss": 0.61031163, "learning_rate": 3.7991767013145902e-06, "loss": 0.63115478, "num_input_tokens_seen": 61127080, "step": 2820, "time_per_iteration": 3.161051034927368 }, { "auxiliary_loss_clip": 0.01125604, "auxiliary_loss_mlp": 0.0105036, "balance_loss_clip": 1.05106986, "balance_loss_mlp": 1.03207326, "epoch": 0.1696076957763415, "flos": 29606516513280.0, "grad_norm": 1.8682266790688726, "language_loss": 0.78265435, "learning_rate": 3.7990065741668844e-06, "loss": 0.80441403, "num_input_tokens_seen": 61146955, "step": 2821, "time_per_iteration": 2.838730573654175 }, { "auxiliary_loss_clip": 0.0113863, "auxiliary_loss_mlp": 0.01055528, "balance_loss_clip": 1.05282724, "balance_loss_mlp": 1.03494084, "epoch": 0.16966781902900946, "flos": 24388588535040.0, "grad_norm": 2.1667405259997516, "language_loss": 0.78521514, "learning_rate": 3.7988363788006685e-06, "loss": 0.80715668, "num_input_tokens_seen": 61166605, "step": 2822, "time_per_iteration": 2.783385753631592 }, { "auxiliary_loss_clip": 0.01143597, "auxiliary_loss_mlp": 0.00777154, "balance_loss_clip": 1.05367076, "balance_loss_mlp": 1.00129986, "epoch": 0.16972794228167745, "flos": 23038814234880.0, "grad_norm": 1.8038457392731222, "language_loss": 0.74939907, "learning_rate": 3.7986661152223967e-06, "loss": 0.76860654, "num_input_tokens_seen": 61186535, "step": 2823, "time_per_iteration": 4.329328298568726 }, { "auxiliary_loss_clip": 0.01129469, "auxiliary_loss_mlp": 0.0105385, "balance_loss_clip": 1.05166912, "balance_loss_mlp": 1.03496754, "epoch": 0.16978806553434542, "flos": 35228691129600.0, "grad_norm": 3.336653609493179, "language_loss": 0.60266119, "learning_rate": 3.7984957834385257e-06, "loss": 0.62449437, "num_input_tokens_seen": 61208965, "step": 2824, "time_per_iteration": 5.892346620559692 }, { "auxiliary_loss_clip": 0.01138249, "auxiliary_loss_mlp": 0.01042322, "balance_loss_clip": 1.05565047, "balance_loss_mlp": 1.02287912, "epoch": 0.16984818878701338, "flos": 32014290936960.0, "grad_norm": 2.152838804074104, "language_loss": 0.73322558, "learning_rate": 3.7983253834555144e-06, "loss": 0.75503135, "num_input_tokens_seen": 61230670, "step": 2825, "time_per_iteration": 2.834482431411743 }, { "auxiliary_loss_clip": 0.01161467, "auxiliary_loss_mlp": 0.01047701, "balance_loss_clip": 1.05502653, "balance_loss_mlp": 1.02762675, "epoch": 0.16990831203968135, "flos": 22818609907200.0, "grad_norm": 2.05671259677731, "language_loss": 0.85638934, "learning_rate": 3.7981549152798245e-06, "loss": 0.87848103, "num_input_tokens_seen": 61249510, "step": 2826, "time_per_iteration": 2.6443135738372803 }, { "auxiliary_loss_clip": 0.01139368, "auxiliary_loss_mlp": 0.01047749, "balance_loss_clip": 1.05266595, "balance_loss_mlp": 1.02856779, "epoch": 0.1699684352923493, "flos": 23039604334080.0, "grad_norm": 1.9562557148441426, "language_loss": 0.82465482, "learning_rate": 3.7979843789179196e-06, "loss": 0.84652597, "num_input_tokens_seen": 61269440, "step": 2827, "time_per_iteration": 2.7683157920837402 }, { "auxiliary_loss_clip": 0.01131885, "auxiliary_loss_mlp": 0.0104561, "balance_loss_clip": 1.05320346, "balance_loss_mlp": 1.02536786, "epoch": 0.17002855854501728, "flos": 21434110133760.0, "grad_norm": 1.7386401818136152, "language_loss": 0.73704529, "learning_rate": 3.797813774376267e-06, "loss": 0.75882024, "num_input_tokens_seen": 61288195, "step": 2828, "time_per_iteration": 4.465311288833618 }, { "auxiliary_loss_clip": 0.01061458, "auxiliary_loss_mlp": 0.01009538, "balance_loss_clip": 1.04764342, "balance_loss_mlp": 1.00620067, "epoch": 0.17008868179768524, "flos": 71453509205760.0, "grad_norm": 0.7670168832041738, "language_loss": 0.56426483, "learning_rate": 3.797643101661336e-06, "loss": 0.58497471, "num_input_tokens_seen": 61350850, "step": 2829, "time_per_iteration": 3.3114631175994873 }, { "auxiliary_loss_clip": 0.01111753, "auxiliary_loss_mlp": 0.01051557, "balance_loss_clip": 1.04527223, "balance_loss_mlp": 1.03088641, "epoch": 0.17014880505035324, "flos": 24900315644160.0, "grad_norm": 1.7961285206560338, "language_loss": 0.83465374, "learning_rate": 3.7974723607795983e-06, "loss": 0.85628688, "num_input_tokens_seen": 61370765, "step": 2830, "time_per_iteration": 2.795253038406372 }, { "auxiliary_loss_clip": 0.01121533, "auxiliary_loss_mlp": 0.0104408, "balance_loss_clip": 1.04901659, "balance_loss_mlp": 1.02442193, "epoch": 0.1702089283030212, "flos": 29862415981440.0, "grad_norm": 2.4873654173451727, "language_loss": 0.78360993, "learning_rate": 3.797301551737529e-06, "loss": 0.80526608, "num_input_tokens_seen": 61388935, "step": 2831, "time_per_iteration": 2.7864232063293457 }, { "auxiliary_loss_clip": 0.01123612, "auxiliary_loss_mlp": 0.01051154, "balance_loss_clip": 1.05275893, "balance_loss_mlp": 1.0311985, "epoch": 0.17026905155568917, "flos": 17744180762880.0, "grad_norm": 2.532473263441992, "language_loss": 0.79668158, "learning_rate": 3.7971306745416044e-06, "loss": 0.81842923, "num_input_tokens_seen": 61407350, "step": 2832, "time_per_iteration": 2.842217206954956 }, { "auxiliary_loss_clip": 0.01127135, "auxiliary_loss_mlp": 0.01048966, "balance_loss_clip": 1.05029321, "balance_loss_mlp": 1.02984488, "epoch": 0.17032917480835713, "flos": 23148665003520.0, "grad_norm": 1.8387196201649116, "language_loss": 0.88638175, "learning_rate": 3.7969597291983046e-06, "loss": 0.90814275, "num_input_tokens_seen": 61429010, "step": 2833, "time_per_iteration": 2.75942325592041 }, { "auxiliary_loss_clip": 0.01158799, "auxiliary_loss_mlp": 0.01046883, "balance_loss_clip": 1.05633831, "balance_loss_mlp": 1.02842951, "epoch": 0.1703892980610251, "flos": 39202565512320.0, "grad_norm": 2.49094605220443, "language_loss": 0.71924698, "learning_rate": 3.7967887157141115e-06, "loss": 0.74130386, "num_input_tokens_seen": 61450040, "step": 2834, "time_per_iteration": 2.9035184383392334 }, { "auxiliary_loss_clip": 0.01119873, "auxiliary_loss_mlp": 0.01052215, "balance_loss_clip": 1.05165124, "balance_loss_mlp": 1.03428626, "epoch": 0.17044942131369306, "flos": 23039101543680.0, "grad_norm": 1.9093816511111852, "language_loss": 0.86831236, "learning_rate": 3.7966176340955106e-06, "loss": 0.89003325, "num_input_tokens_seen": 61468585, "step": 2835, "time_per_iteration": 2.7627484798431396 }, { "auxiliary_loss_clip": 0.01149332, "auxiliary_loss_mlp": 0.01049844, "balance_loss_clip": 1.0536654, "balance_loss_mlp": 1.02887547, "epoch": 0.17050954456636103, "flos": 17054983532160.0, "grad_norm": 2.1227367002258153, "language_loss": 0.74483943, "learning_rate": 3.796446484348989e-06, "loss": 0.76683116, "num_input_tokens_seen": 61486330, "step": 2836, "time_per_iteration": 2.6748619079589844 }, { "auxiliary_loss_clip": 0.01102249, "auxiliary_loss_mlp": 0.01049533, "balance_loss_clip": 1.04775679, "balance_loss_mlp": 1.02790809, "epoch": 0.17056966781902902, "flos": 16836969934080.0, "grad_norm": 2.1718385109372824, "language_loss": 0.79959226, "learning_rate": 3.796275266481036e-06, "loss": 0.82111007, "num_input_tokens_seen": 61503950, "step": 2837, "time_per_iteration": 2.757340908050537 }, { "auxiliary_loss_clip": 0.01144378, "auxiliary_loss_mlp": 0.01044803, "balance_loss_clip": 1.05493581, "balance_loss_mlp": 1.02644491, "epoch": 0.17062979107169698, "flos": 17712543859200.0, "grad_norm": 1.6825251002952497, "language_loss": 0.83258498, "learning_rate": 3.7961039804981456e-06, "loss": 0.85447681, "num_input_tokens_seen": 61523550, "step": 2838, "time_per_iteration": 2.705357551574707 }, { "auxiliary_loss_clip": 0.0110604, "auxiliary_loss_mlp": 0.01044889, "balance_loss_clip": 1.05217135, "balance_loss_mlp": 1.02685261, "epoch": 0.17068991432436495, "flos": 22525040050560.0, "grad_norm": 1.7789799751303759, "language_loss": 0.93788463, "learning_rate": 3.795932626406812e-06, "loss": 0.95939398, "num_input_tokens_seen": 61542720, "step": 2839, "time_per_iteration": 2.7881791591644287 }, { "auxiliary_loss_clip": 0.01126465, "auxiliary_loss_mlp": 0.01045617, "balance_loss_clip": 1.05183244, "balance_loss_mlp": 1.0250175, "epoch": 0.17075003757703291, "flos": 25882939077120.0, "grad_norm": 2.3337760403585435, "language_loss": 0.83974946, "learning_rate": 3.7957612042135336e-06, "loss": 0.86147022, "num_input_tokens_seen": 61563040, "step": 2840, "time_per_iteration": 2.7564892768859863 }, { "auxiliary_loss_clip": 0.01151834, "auxiliary_loss_mlp": 0.01044417, "balance_loss_clip": 1.05555129, "balance_loss_mlp": 1.02449679, "epoch": 0.17081016082970088, "flos": 20120713332480.0, "grad_norm": 1.9037435592597944, "language_loss": 0.76307738, "learning_rate": 3.79558971392481e-06, "loss": 0.7850399, "num_input_tokens_seen": 61581890, "step": 2841, "time_per_iteration": 2.695525646209717 }, { "auxiliary_loss_clip": 0.01136217, "auxiliary_loss_mlp": 0.01045847, "balance_loss_clip": 1.0527097, "balance_loss_mlp": 1.02744126, "epoch": 0.17087028408236885, "flos": 24936477661440.0, "grad_norm": 1.7844240011089845, "language_loss": 0.77076876, "learning_rate": 3.7954181555471443e-06, "loss": 0.79258937, "num_input_tokens_seen": 61602095, "step": 2842, "time_per_iteration": 2.773792266845703 }, { "auxiliary_loss_clip": 0.01155915, "auxiliary_loss_mlp": 0.01043896, "balance_loss_clip": 1.05616069, "balance_loss_mlp": 1.02503705, "epoch": 0.17093040733503684, "flos": 19057864872960.0, "grad_norm": 1.8430349199993477, "language_loss": 0.85694385, "learning_rate": 3.795246529087043e-06, "loss": 0.87894201, "num_input_tokens_seen": 61620400, "step": 2843, "time_per_iteration": 2.5860671997070312 }, { "auxiliary_loss_clip": 0.01154742, "auxiliary_loss_mlp": 0.01044059, "balance_loss_clip": 1.05549574, "balance_loss_mlp": 1.02608204, "epoch": 0.1709905305877048, "flos": 13078954333440.0, "grad_norm": 2.0353470349004485, "language_loss": 0.68646181, "learning_rate": 3.7950748345510126e-06, "loss": 0.70844984, "num_input_tokens_seen": 61637680, "step": 2844, "time_per_iteration": 2.5961523056030273 }, { "auxiliary_loss_clip": 0.01133396, "auxiliary_loss_mlp": 0.00778162, "balance_loss_clip": 1.05117011, "balance_loss_mlp": 1.00112617, "epoch": 0.17105065384037277, "flos": 19209336526080.0, "grad_norm": 2.027694794878894, "language_loss": 0.78771943, "learning_rate": 3.7949030719455646e-06, "loss": 0.806835, "num_input_tokens_seen": 61655630, "step": 2845, "time_per_iteration": 2.720193386077881 }, { "auxiliary_loss_clip": 0.01145033, "auxiliary_loss_mlp": 0.01047407, "balance_loss_clip": 1.05443549, "balance_loss_mlp": 1.02914453, "epoch": 0.17111077709304073, "flos": 18515183218560.0, "grad_norm": 2.2586144454646306, "language_loss": 0.7811147, "learning_rate": 3.7947312412772127e-06, "loss": 0.80303913, "num_input_tokens_seen": 61673475, "step": 2846, "time_per_iteration": 2.691033363342285 }, { "auxiliary_loss_clip": 0.01143809, "auxiliary_loss_mlp": 0.0104645, "balance_loss_clip": 1.05425262, "balance_loss_mlp": 1.02865243, "epoch": 0.1711709003457087, "flos": 25082670015360.0, "grad_norm": 2.2208975060456426, "language_loss": 0.79762948, "learning_rate": 3.794559342552472e-06, "loss": 0.8195321, "num_input_tokens_seen": 61693370, "step": 2847, "time_per_iteration": 2.7504522800445557 }, { "auxiliary_loss_clip": 0.01142651, "auxiliary_loss_mlp": 0.01045695, "balance_loss_clip": 1.05101562, "balance_loss_mlp": 1.02668071, "epoch": 0.17123102359837666, "flos": 17566387418880.0, "grad_norm": 2.4457083156230017, "language_loss": 0.8665086, "learning_rate": 3.7943873757778614e-06, "loss": 0.88839209, "num_input_tokens_seen": 61710820, "step": 2848, "time_per_iteration": 2.642946720123291 }, { "auxiliary_loss_clip": 0.0111167, "auxiliary_loss_mlp": 0.01044479, "balance_loss_clip": 1.04839015, "balance_loss_mlp": 1.02559662, "epoch": 0.17129114685104463, "flos": 26173635845760.0, "grad_norm": 3.6033710399461856, "language_loss": 0.75238276, "learning_rate": 3.794215340959902e-06, "loss": 0.77394426, "num_input_tokens_seen": 61729855, "step": 2849, "time_per_iteration": 2.7511017322540283 }, { "auxiliary_loss_clip": 0.0103263, "auxiliary_loss_mlp": 0.01006833, "balance_loss_clip": 1.02775574, "balance_loss_mlp": 1.00413883, "epoch": 0.17135127010371262, "flos": 69269710037760.0, "grad_norm": 0.7881928427119427, "language_loss": 0.57514679, "learning_rate": 3.7940432381051163e-06, "loss": 0.59554148, "num_input_tokens_seen": 61790290, "step": 2850, "time_per_iteration": 3.234609603881836 }, { "auxiliary_loss_clip": 0.01115021, "auxiliary_loss_mlp": 0.01044381, "balance_loss_clip": 1.05049884, "balance_loss_mlp": 1.02661848, "epoch": 0.1714113933563806, "flos": 23550110380800.0, "grad_norm": 2.962731712990184, "language_loss": 0.81328994, "learning_rate": 3.793871067220031e-06, "loss": 0.83488399, "num_input_tokens_seen": 61809265, "step": 2851, "time_per_iteration": 2.78957200050354 }, { "auxiliary_loss_clip": 0.01114419, "auxiliary_loss_mlp": 0.01043587, "balance_loss_clip": 1.05193233, "balance_loss_mlp": 1.02592039, "epoch": 0.17147151660904855, "flos": 21142443697920.0, "grad_norm": 2.049906502724323, "language_loss": 0.93085313, "learning_rate": 3.7936988283111764e-06, "loss": 0.95243311, "num_input_tokens_seen": 61828980, "step": 2852, "time_per_iteration": 2.8247029781341553 }, { "auxiliary_loss_clip": 0.01123258, "auxiliary_loss_mlp": 0.01048953, "balance_loss_clip": 1.04961288, "balance_loss_mlp": 1.03045225, "epoch": 0.17153163986171652, "flos": 18624890332800.0, "grad_norm": 1.8770741979814063, "language_loss": 0.69465554, "learning_rate": 3.7935265213850817e-06, "loss": 0.71637762, "num_input_tokens_seen": 61847915, "step": 2853, "time_per_iteration": 2.814162492752075 }, { "auxiliary_loss_clip": 0.01120856, "auxiliary_loss_mlp": 0.0104692, "balance_loss_clip": 1.05593121, "balance_loss_mlp": 1.02899122, "epoch": 0.17159176311438448, "flos": 18223265387520.0, "grad_norm": 2.5884803351111705, "language_loss": 0.66611075, "learning_rate": 3.7933541464482815e-06, "loss": 0.68778855, "num_input_tokens_seen": 61865570, "step": 2854, "time_per_iteration": 2.7968995571136475 }, { "auxiliary_loss_clip": 0.01120742, "auxiliary_loss_mlp": 0.01052217, "balance_loss_clip": 1.04853106, "balance_loss_mlp": 1.0349679, "epoch": 0.17165188636705245, "flos": 20738987159040.0, "grad_norm": 1.705510390491261, "language_loss": 0.8929621, "learning_rate": 3.7931817035073124e-06, "loss": 0.91469175, "num_input_tokens_seen": 61883340, "step": 2855, "time_per_iteration": 2.7045016288757324 }, { "auxiliary_loss_clip": 0.01157319, "auxiliary_loss_mlp": 0.01043813, "balance_loss_clip": 1.05505848, "balance_loss_mlp": 1.02662265, "epoch": 0.17171200961972044, "flos": 24899884680960.0, "grad_norm": 2.117219134143716, "language_loss": 0.83963835, "learning_rate": 3.7930091925687134e-06, "loss": 0.86164963, "num_input_tokens_seen": 61900610, "step": 2856, "time_per_iteration": 2.7349936962127686 }, { "auxiliary_loss_clip": 0.01150108, "auxiliary_loss_mlp": 0.0104615, "balance_loss_clip": 1.05812418, "balance_loss_mlp": 1.02783966, "epoch": 0.1717721328723884, "flos": 20157234485760.0, "grad_norm": 2.234025867710235, "language_loss": 0.86309886, "learning_rate": 3.792836613639026e-06, "loss": 0.88506144, "num_input_tokens_seen": 61916795, "step": 2857, "time_per_iteration": 2.749356746673584 }, { "auxiliary_loss_clip": 0.01144467, "auxiliary_loss_mlp": 0.0105057, "balance_loss_clip": 1.05469525, "balance_loss_mlp": 1.0324626, "epoch": 0.17183225612505637, "flos": 23361650697600.0, "grad_norm": 2.069122070501307, "language_loss": 0.78334701, "learning_rate": 3.7926639667247947e-06, "loss": 0.80529737, "num_input_tokens_seen": 61936665, "step": 2858, "time_per_iteration": 2.6673583984375 }, { "auxiliary_loss_clip": 0.01147374, "auxiliary_loss_mlp": 0.0105371, "balance_loss_clip": 1.05591416, "balance_loss_mlp": 1.03263378, "epoch": 0.17189237937772434, "flos": 18114240631680.0, "grad_norm": 2.1629422323642453, "language_loss": 0.77565676, "learning_rate": 3.7924912518325663e-06, "loss": 0.79766762, "num_input_tokens_seen": 61954415, "step": 2859, "time_per_iteration": 2.646648645401001 }, { "auxiliary_loss_clip": 0.0110879, "auxiliary_loss_mlp": 0.01047481, "balance_loss_clip": 1.05317724, "balance_loss_mlp": 1.02887201, "epoch": 0.1719525026303923, "flos": 23258408031360.0, "grad_norm": 2.088627069497316, "language_loss": 0.77088714, "learning_rate": 3.7923184689688902e-06, "loss": 0.79244983, "num_input_tokens_seen": 61973940, "step": 2860, "time_per_iteration": 2.7671573162078857 }, { "auxiliary_loss_clip": 0.01145562, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.05316472, "balance_loss_mlp": 1.02416611, "epoch": 0.17201262588306027, "flos": 20810413353600.0, "grad_norm": 2.1608688480628304, "language_loss": 0.81384242, "learning_rate": 3.792145618140317e-06, "loss": 0.83571851, "num_input_tokens_seen": 61991845, "step": 2861, "time_per_iteration": 2.6492061614990234 }, { "auxiliary_loss_clip": 0.011306, "auxiliary_loss_mlp": 0.01051558, "balance_loss_clip": 1.05280077, "balance_loss_mlp": 1.0335927, "epoch": 0.17207274913572823, "flos": 20375858615040.0, "grad_norm": 2.0128324416816192, "language_loss": 0.85691392, "learning_rate": 3.7919726993534038e-06, "loss": 0.87873554, "num_input_tokens_seen": 62009395, "step": 2862, "time_per_iteration": 4.290126323699951 }, { "auxiliary_loss_clip": 0.01116765, "auxiliary_loss_mlp": 0.01043444, "balance_loss_clip": 1.05126834, "balance_loss_mlp": 1.02655208, "epoch": 0.17213287238839622, "flos": 26797727675520.0, "grad_norm": 3.7047120479299993, "language_loss": 0.78047049, "learning_rate": 3.7917997126147054e-06, "loss": 0.80207253, "num_input_tokens_seen": 62029005, "step": 2863, "time_per_iteration": 4.275500774383545 }, { "auxiliary_loss_clip": 0.01122315, "auxiliary_loss_mlp": 0.00776596, "balance_loss_clip": 1.05132961, "balance_loss_mlp": 1.00090909, "epoch": 0.1721929956410642, "flos": 26030819370240.0, "grad_norm": 1.7350128683820358, "language_loss": 0.72135127, "learning_rate": 3.7916266579307823e-06, "loss": 0.74034035, "num_input_tokens_seen": 62048730, "step": 2864, "time_per_iteration": 4.414710998535156 }, { "auxiliary_loss_clip": 0.01121488, "auxiliary_loss_mlp": 0.01049611, "balance_loss_clip": 1.05114079, "balance_loss_mlp": 1.03099, "epoch": 0.17225311889373215, "flos": 22273091078400.0, "grad_norm": 1.9270646210248614, "language_loss": 0.73002023, "learning_rate": 3.7914535353081973e-06, "loss": 0.75173128, "num_input_tokens_seen": 62069000, "step": 2865, "time_per_iteration": 2.7463715076446533 }, { "auxiliary_loss_clip": 0.01145037, "auxiliary_loss_mlp": 0.0077644, "balance_loss_clip": 1.05669165, "balance_loss_mlp": 1.00120521, "epoch": 0.17231324214640012, "flos": 21287774125440.0, "grad_norm": 2.669585642962841, "language_loss": 0.78357804, "learning_rate": 3.7912803447535145e-06, "loss": 0.80279285, "num_input_tokens_seen": 62086750, "step": 2866, "time_per_iteration": 2.785146713256836 }, { "auxiliary_loss_clip": 0.01157272, "auxiliary_loss_mlp": 0.01044358, "balance_loss_clip": 1.05600274, "balance_loss_mlp": 1.02536821, "epoch": 0.17237336539906808, "flos": 19680735640320.0, "grad_norm": 2.551277931358127, "language_loss": 0.79755104, "learning_rate": 3.7911070862733016e-06, "loss": 0.81956732, "num_input_tokens_seen": 62106240, "step": 2867, "time_per_iteration": 4.3145318031311035 }, { "auxiliary_loss_clip": 0.01132297, "auxiliary_loss_mlp": 0.01041396, "balance_loss_clip": 1.0529356, "balance_loss_mlp": 1.02274013, "epoch": 0.17243348865173605, "flos": 17529650784000.0, "grad_norm": 1.8689780270661371, "language_loss": 0.79206991, "learning_rate": 3.7909337598741276e-06, "loss": 0.81380683, "num_input_tokens_seen": 62124895, "step": 2868, "time_per_iteration": 2.7683827877044678 }, { "auxiliary_loss_clip": 0.01111702, "auxiliary_loss_mlp": 0.01041717, "balance_loss_clip": 1.05331647, "balance_loss_mlp": 1.02427697, "epoch": 0.17249361190440402, "flos": 18259858368000.0, "grad_norm": 2.0344588273772923, "language_loss": 0.84221756, "learning_rate": 3.7907603655625674e-06, "loss": 0.86375177, "num_input_tokens_seen": 62143510, "step": 2869, "time_per_iteration": 2.729156970977783 }, { "auxiliary_loss_clip": 0.01132999, "auxiliary_loss_mlp": 0.01048405, "balance_loss_clip": 1.0535363, "balance_loss_mlp": 1.02955842, "epoch": 0.172553735157072, "flos": 21174367910400.0, "grad_norm": 1.8935704627114847, "language_loss": 0.77299273, "learning_rate": 3.7905869033451932e-06, "loss": 0.79480684, "num_input_tokens_seen": 62162285, "step": 2870, "time_per_iteration": 2.752739191055298 }, { "auxiliary_loss_clip": 0.0115398, "auxiliary_loss_mlp": 0.01037809, "balance_loss_clip": 1.05671024, "balance_loss_mlp": 1.02110744, "epoch": 0.17261385840973997, "flos": 22273270646400.0, "grad_norm": 2.0115587398764396, "language_loss": 0.77409238, "learning_rate": 3.7904133732285857e-06, "loss": 0.79601026, "num_input_tokens_seen": 62180970, "step": 2871, "time_per_iteration": 2.660627603530884 }, { "auxiliary_loss_clip": 0.01132474, "auxiliary_loss_mlp": 0.01041073, "balance_loss_clip": 1.05313993, "balance_loss_mlp": 1.0222379, "epoch": 0.17267398166240794, "flos": 27922233830400.0, "grad_norm": 2.203011669690562, "language_loss": 0.74197829, "learning_rate": 3.7902397752193228e-06, "loss": 0.76371384, "num_input_tokens_seen": 62198965, "step": 2872, "time_per_iteration": 2.6959900856018066 }, { "auxiliary_loss_clip": 0.01150773, "auxiliary_loss_mlp": 0.01041508, "balance_loss_clip": 1.05359554, "balance_loss_mlp": 1.02362645, "epoch": 0.1727341049150759, "flos": 21945118970880.0, "grad_norm": 1.7914171074077658, "language_loss": 0.82336062, "learning_rate": 3.790066109323988e-06, "loss": 0.84528345, "num_input_tokens_seen": 62219890, "step": 2873, "time_per_iteration": 2.603564977645874 }, { "auxiliary_loss_clip": 0.01108819, "auxiliary_loss_mlp": 0.01044995, "balance_loss_clip": 1.04744792, "balance_loss_mlp": 1.02522969, "epoch": 0.17279422816774387, "flos": 18107883924480.0, "grad_norm": 3.7341652608759297, "language_loss": 0.75355422, "learning_rate": 3.7898923755491678e-06, "loss": 0.77509236, "num_input_tokens_seen": 62237140, "step": 2874, "time_per_iteration": 2.8438260555267334 }, { "auxiliary_loss_clip": 0.01159322, "auxiliary_loss_mlp": 0.01044415, "balance_loss_clip": 1.05658269, "balance_loss_mlp": 1.02404249, "epoch": 0.17285435142041183, "flos": 21835447770240.0, "grad_norm": 2.7053876793207037, "language_loss": 0.80239916, "learning_rate": 3.7897185739014487e-06, "loss": 0.82443655, "num_input_tokens_seen": 62255405, "step": 2875, "time_per_iteration": 2.625183343887329 }, { "auxiliary_loss_clip": 0.01135727, "auxiliary_loss_mlp": 0.0105273, "balance_loss_clip": 1.0535475, "balance_loss_mlp": 1.03297722, "epoch": 0.17291447467307983, "flos": 18368452160640.0, "grad_norm": 3.840653645811056, "language_loss": 0.87621164, "learning_rate": 3.7895447043874217e-06, "loss": 0.8980962, "num_input_tokens_seen": 62271280, "step": 2876, "time_per_iteration": 2.6782751083374023 }, { "auxiliary_loss_clip": 0.01136898, "auxiliary_loss_mlp": 0.01044228, "balance_loss_clip": 1.05730534, "balance_loss_mlp": 1.02559566, "epoch": 0.1729745979257478, "flos": 18624638937600.0, "grad_norm": 1.8931416121171032, "language_loss": 0.84386718, "learning_rate": 3.789370767013681e-06, "loss": 0.86567843, "num_input_tokens_seen": 62289140, "step": 2877, "time_per_iteration": 2.681131362915039 }, { "auxiliary_loss_clip": 0.01120759, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.05222571, "balance_loss_mlp": 1.02499604, "epoch": 0.17303472117841576, "flos": 22998234844800.0, "grad_norm": 2.106635210245156, "language_loss": 0.79660022, "learning_rate": 3.7891967617868204e-06, "loss": 0.81824744, "num_input_tokens_seen": 62307490, "step": 2878, "time_per_iteration": 2.8118834495544434 }, { "auxiliary_loss_clip": 0.01136112, "auxiliary_loss_mlp": 0.01047222, "balance_loss_clip": 1.05593777, "balance_loss_mlp": 1.02953172, "epoch": 0.17309484443108372, "flos": 25664386775040.0, "grad_norm": 1.9675557254753375, "language_loss": 0.70236337, "learning_rate": 3.78902268871344e-06, "loss": 0.72419673, "num_input_tokens_seen": 62328570, "step": 2879, "time_per_iteration": 2.7998502254486084 }, { "auxiliary_loss_clip": 0.01130517, "auxiliary_loss_mlp": 0.01051722, "balance_loss_clip": 1.05183411, "balance_loss_mlp": 1.03337598, "epoch": 0.1731549676837517, "flos": 13552903313280.0, "grad_norm": 2.0545155253910163, "language_loss": 0.82884222, "learning_rate": 3.78884854780014e-06, "loss": 0.85066462, "num_input_tokens_seen": 62345735, "step": 2880, "time_per_iteration": 2.6707684993743896 }, { "auxiliary_loss_clip": 0.01110706, "auxiliary_loss_mlp": 0.01054327, "balance_loss_clip": 1.05214918, "balance_loss_mlp": 1.03303647, "epoch": 0.17321509093641965, "flos": 22857070394880.0, "grad_norm": 1.9029231217608267, "language_loss": 0.80879176, "learning_rate": 3.7886743390535236e-06, "loss": 0.83044201, "num_input_tokens_seen": 62365525, "step": 2881, "time_per_iteration": 2.7851576805114746 }, { "auxiliary_loss_clip": 0.01135983, "auxiliary_loss_mlp": 0.01046895, "balance_loss_clip": 1.05544055, "balance_loss_mlp": 1.02921653, "epoch": 0.17327521418908762, "flos": 24352785653760.0, "grad_norm": 2.753231520615002, "language_loss": 0.77268815, "learning_rate": 3.788500062480197e-06, "loss": 0.79451692, "num_input_tokens_seen": 62385160, "step": 2882, "time_per_iteration": 2.7785212993621826 }, { "auxiliary_loss_clip": 0.01124099, "auxiliary_loss_mlp": 0.01047516, "balance_loss_clip": 1.0633558, "balance_loss_mlp": 1.02947998, "epoch": 0.1733353374417556, "flos": 33105651816960.0, "grad_norm": 2.096311926604511, "language_loss": 0.76714236, "learning_rate": 3.788325718086769e-06, "loss": 0.78885853, "num_input_tokens_seen": 62405280, "step": 2883, "time_per_iteration": 2.838848352432251 }, { "auxiliary_loss_clip": 0.01110924, "auxiliary_loss_mlp": 0.0104619, "balance_loss_clip": 1.04929209, "balance_loss_mlp": 1.02821302, "epoch": 0.17339546069442358, "flos": 24388947671040.0, "grad_norm": 2.1194201700326873, "language_loss": 0.8555252, "learning_rate": 3.7881513058798503e-06, "loss": 0.87709635, "num_input_tokens_seen": 62423665, "step": 2884, "time_per_iteration": 2.829376220703125 }, { "auxiliary_loss_clip": 0.01133962, "auxiliary_loss_mlp": 0.00775817, "balance_loss_clip": 1.05472779, "balance_loss_mlp": 1.00088096, "epoch": 0.17345558394709154, "flos": 27454174680960.0, "grad_norm": 1.7131036779262108, "language_loss": 0.74756771, "learning_rate": 3.787976825866055e-06, "loss": 0.76666546, "num_input_tokens_seen": 62445170, "step": 2885, "time_per_iteration": 2.8710989952087402 }, { "auxiliary_loss_clip": 0.01128977, "auxiliary_loss_mlp": 0.01044901, "balance_loss_clip": 1.05498922, "balance_loss_mlp": 1.0280925, "epoch": 0.1735157071997595, "flos": 24682158391680.0, "grad_norm": 2.374438581614022, "language_loss": 0.7107017, "learning_rate": 3.7878022780519998e-06, "loss": 0.73244053, "num_input_tokens_seen": 62466135, "step": 2886, "time_per_iteration": 2.726621150970459 }, { "auxiliary_loss_clip": 0.01142411, "auxiliary_loss_mlp": 0.01041857, "balance_loss_clip": 1.05233932, "balance_loss_mlp": 1.02408338, "epoch": 0.17357583045242747, "flos": 21688932193920.0, "grad_norm": 2.0566537172661747, "language_loss": 0.69906294, "learning_rate": 3.7876276624443024e-06, "loss": 0.72090566, "num_input_tokens_seen": 62483910, "step": 2887, "time_per_iteration": 2.7066688537597656 }, { "auxiliary_loss_clip": 0.01116425, "auxiliary_loss_mlp": 0.01045383, "balance_loss_clip": 1.05328536, "balance_loss_mlp": 1.02728677, "epoch": 0.17363595370509544, "flos": 15375728753280.0, "grad_norm": 2.038016964464323, "language_loss": 0.85257947, "learning_rate": 3.787452979049585e-06, "loss": 0.87419748, "num_input_tokens_seen": 62501530, "step": 2888, "time_per_iteration": 2.7514970302581787 }, { "auxiliary_loss_clip": 0.01095063, "auxiliary_loss_mlp": 0.01049413, "balance_loss_clip": 1.05020595, "balance_loss_mlp": 1.02822983, "epoch": 0.1736960769577634, "flos": 23440941970560.0, "grad_norm": 2.196318077733749, "language_loss": 0.78491282, "learning_rate": 3.7872782278744718e-06, "loss": 0.80635762, "num_input_tokens_seen": 62521295, "step": 2889, "time_per_iteration": 2.8221559524536133 }, { "auxiliary_loss_clip": 0.01112139, "auxiliary_loss_mlp": 0.0077601, "balance_loss_clip": 1.05236733, "balance_loss_mlp": 1.00114667, "epoch": 0.1737562002104314, "flos": 18587830475520.0, "grad_norm": 2.333227367674716, "language_loss": 0.84076989, "learning_rate": 3.7871034089255883e-06, "loss": 0.85965133, "num_input_tokens_seen": 62539615, "step": 2890, "time_per_iteration": 2.7213382720947266 }, { "auxiliary_loss_clip": 0.01142218, "auxiliary_loss_mlp": 0.01054918, "balance_loss_clip": 1.05530691, "balance_loss_mlp": 1.03752589, "epoch": 0.17381632346309936, "flos": 15998060816640.0, "grad_norm": 2.7278091568285596, "language_loss": 0.82205319, "learning_rate": 3.7869285222095653e-06, "loss": 0.84402454, "num_input_tokens_seen": 62556820, "step": 2891, "time_per_iteration": 2.625162363052368 }, { "auxiliary_loss_clip": 0.01097361, "auxiliary_loss_mlp": 0.01050012, "balance_loss_clip": 1.04281187, "balance_loss_mlp": 1.02876878, "epoch": 0.17387644671576732, "flos": 13369830670080.0, "grad_norm": 1.9017653264876209, "language_loss": 0.81200826, "learning_rate": 3.7867535677330334e-06, "loss": 0.83348203, "num_input_tokens_seen": 62572450, "step": 2892, "time_per_iteration": 2.7682459354400635 }, { "auxiliary_loss_clip": 0.01148834, "auxiliary_loss_mlp": 0.0105551, "balance_loss_clip": 1.05707812, "balance_loss_mlp": 1.03631687, "epoch": 0.1739365699684353, "flos": 26615516958720.0, "grad_norm": 2.0056711213447436, "language_loss": 0.73950225, "learning_rate": 3.786578545502627e-06, "loss": 0.76154572, "num_input_tokens_seen": 62592580, "step": 2893, "time_per_iteration": 2.8463022708892822 }, { "auxiliary_loss_clip": 0.01132474, "auxiliary_loss_mlp": 0.01043509, "balance_loss_clip": 1.05198765, "balance_loss_mlp": 1.02443516, "epoch": 0.17399669322110325, "flos": 23367971491200.0, "grad_norm": 4.010773627073901, "language_loss": 0.82507658, "learning_rate": 3.7864034555249828e-06, "loss": 0.84683645, "num_input_tokens_seen": 62611220, "step": 2894, "time_per_iteration": 2.719564914703369 }, { "auxiliary_loss_clip": 0.01113951, "auxiliary_loss_mlp": 0.01046249, "balance_loss_clip": 1.0506922, "balance_loss_mlp": 1.02463603, "epoch": 0.17405681647377122, "flos": 22054107813120.0, "grad_norm": 2.3322053123967574, "language_loss": 0.73826683, "learning_rate": 3.786228297806741e-06, "loss": 0.7598688, "num_input_tokens_seen": 62629185, "step": 2895, "time_per_iteration": 2.743992805480957 }, { "auxiliary_loss_clip": 0.01037578, "auxiliary_loss_mlp": 0.01011099, "balance_loss_clip": 1.0404408, "balance_loss_mlp": 1.00788069, "epoch": 0.1741169397264392, "flos": 61457559114240.0, "grad_norm": 0.8765647158253519, "language_loss": 0.62754023, "learning_rate": 3.7860530723545435e-06, "loss": 0.64802706, "num_input_tokens_seen": 62691895, "step": 2896, "time_per_iteration": 3.345099687576294 }, { "auxiliary_loss_clip": 0.0113101, "auxiliary_loss_mlp": 0.00776588, "balance_loss_clip": 1.05246758, "balance_loss_mlp": 1.00102258, "epoch": 0.17417706297910718, "flos": 27017680608000.0, "grad_norm": 1.7338863964520728, "language_loss": 0.75822324, "learning_rate": 3.785877779175034e-06, "loss": 0.77729923, "num_input_tokens_seen": 62713790, "step": 2897, "time_per_iteration": 2.772292137145996 }, { "auxiliary_loss_clip": 0.01141357, "auxiliary_loss_mlp": 0.01042983, "balance_loss_clip": 1.0545547, "balance_loss_mlp": 1.02512598, "epoch": 0.17423718623177514, "flos": 33508856960640.0, "grad_norm": 1.944569306659421, "language_loss": 0.6883949, "learning_rate": 3.7857024182748606e-06, "loss": 0.71023834, "num_input_tokens_seen": 62736285, "step": 2898, "time_per_iteration": 2.7278554439544678 }, { "auxiliary_loss_clip": 0.01128715, "auxiliary_loss_mlp": 0.01044216, "balance_loss_clip": 1.05251193, "balance_loss_mlp": 1.02504694, "epoch": 0.1742973094844431, "flos": 27198634348800.0, "grad_norm": 2.99011081330885, "language_loss": 0.76445562, "learning_rate": 3.7855269896606717e-06, "loss": 0.78618491, "num_input_tokens_seen": 62756240, "step": 2899, "time_per_iteration": 2.8052010536193848 }, { "auxiliary_loss_clip": 0.01095069, "auxiliary_loss_mlp": 0.01045896, "balance_loss_clip": 1.04680347, "balance_loss_mlp": 1.02632213, "epoch": 0.17435743273711107, "flos": 22710734386560.0, "grad_norm": 3.2965812335226357, "language_loss": 0.72860038, "learning_rate": 3.785351493339121e-06, "loss": 0.75001007, "num_input_tokens_seen": 62775910, "step": 2900, "time_per_iteration": 2.868218421936035 }, { "auxiliary_loss_clip": 0.01110522, "auxiliary_loss_mlp": 0.00776698, "balance_loss_clip": 1.05202782, "balance_loss_mlp": 1.000983, "epoch": 0.17441755598977904, "flos": 41646466039680.0, "grad_norm": 1.5488662608930523, "language_loss": 0.69946706, "learning_rate": 3.785175929316863e-06, "loss": 0.71833932, "num_input_tokens_seen": 62799385, "step": 2901, "time_per_iteration": 4.407040596008301 }, { "auxiliary_loss_clip": 0.01129098, "auxiliary_loss_mlp": 0.01045525, "balance_loss_clip": 1.05246592, "balance_loss_mlp": 1.02764344, "epoch": 0.174477679242447, "flos": 26287077974400.0, "grad_norm": 2.1785959913748965, "language_loss": 0.76588804, "learning_rate": 3.7850002976005543e-06, "loss": 0.78763425, "num_input_tokens_seen": 62819380, "step": 2902, "time_per_iteration": 4.2244462966918945 }, { "auxiliary_loss_clip": 0.01145685, "auxiliary_loss_mlp": 0.0104382, "balance_loss_clip": 1.0531354, "balance_loss_mlp": 1.02567625, "epoch": 0.174537802495115, "flos": 17858412990720.0, "grad_norm": 2.2508699895191073, "language_loss": 0.81588745, "learning_rate": 3.7848245981968558e-06, "loss": 0.83778256, "num_input_tokens_seen": 62836205, "step": 2903, "time_per_iteration": 4.132925271987915 }, { "auxiliary_loss_clip": 0.01126443, "auxiliary_loss_mlp": 0.0103942, "balance_loss_clip": 1.05449986, "balance_loss_mlp": 1.02135992, "epoch": 0.17459792574778296, "flos": 16940715390720.0, "grad_norm": 2.4085694554154187, "language_loss": 0.73316491, "learning_rate": 3.784648831112429e-06, "loss": 0.75482351, "num_input_tokens_seen": 62854045, "step": 2904, "time_per_iteration": 2.7033374309539795 }, { "auxiliary_loss_clip": 0.01105192, "auxiliary_loss_mlp": 0.0104577, "balance_loss_clip": 1.05250716, "balance_loss_mlp": 1.02822256, "epoch": 0.17465804900045093, "flos": 25520026014720.0, "grad_norm": 1.8783326609306377, "language_loss": 0.64233291, "learning_rate": 3.7844729963539406e-06, "loss": 0.66384256, "num_input_tokens_seen": 62873075, "step": 2905, "time_per_iteration": 2.8325791358947754 }, { "auxiliary_loss_clip": 0.01135256, "auxiliary_loss_mlp": 0.01053006, "balance_loss_clip": 1.05869055, "balance_loss_mlp": 1.03370619, "epoch": 0.1747181722531189, "flos": 24129708238080.0, "grad_norm": 2.820817719352069, "language_loss": 0.79504299, "learning_rate": 3.7842970939280566e-06, "loss": 0.81692564, "num_input_tokens_seen": 62892675, "step": 2906, "time_per_iteration": 4.491498231887817 }, { "auxiliary_loss_clip": 0.01146195, "auxiliary_loss_mlp": 0.01050729, "balance_loss_clip": 1.05623174, "balance_loss_mlp": 1.03258538, "epoch": 0.17477829550578686, "flos": 17748813617280.0, "grad_norm": 2.262709441571415, "language_loss": 0.81318873, "learning_rate": 3.784121123841449e-06, "loss": 0.83515799, "num_input_tokens_seen": 62910675, "step": 2907, "time_per_iteration": 2.6855854988098145 }, { "auxiliary_loss_clip": 0.01143202, "auxiliary_loss_mlp": 0.01043315, "balance_loss_clip": 1.05374384, "balance_loss_mlp": 1.0253861, "epoch": 0.17483841875845482, "flos": 15377344865280.0, "grad_norm": 2.068635027461873, "language_loss": 0.81342787, "learning_rate": 3.7839450861007886e-06, "loss": 0.83529305, "num_input_tokens_seen": 62928130, "step": 2908, "time_per_iteration": 2.6449570655822754 }, { "auxiliary_loss_clip": 0.01127136, "auxiliary_loss_mlp": 0.01050925, "balance_loss_clip": 1.05178046, "balance_loss_mlp": 1.03163743, "epoch": 0.17489854201112282, "flos": 17163254102400.0, "grad_norm": 3.147433356867123, "language_loss": 0.80020624, "learning_rate": 3.7837689807127518e-06, "loss": 0.82198691, "num_input_tokens_seen": 62944290, "step": 2909, "time_per_iteration": 2.6820569038391113 }, { "auxiliary_loss_clip": 0.0109059, "auxiliary_loss_mlp": 0.01052625, "balance_loss_clip": 1.05020881, "balance_loss_mlp": 1.0310595, "epoch": 0.17495866526379078, "flos": 19755286318080.0, "grad_norm": 1.6978440546881337, "language_loss": 0.76742244, "learning_rate": 3.783592807684017e-06, "loss": 0.7888546, "num_input_tokens_seen": 62963505, "step": 2910, "time_per_iteration": 2.6980416774749756 }, { "auxiliary_loss_clip": 0.01158552, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.05618358, "balance_loss_mlp": 1.03059566, "epoch": 0.17501878851645875, "flos": 28511133310080.0, "grad_norm": 1.9812610358315632, "language_loss": 0.8698765, "learning_rate": 3.7834165670212645e-06, "loss": 0.89195609, "num_input_tokens_seen": 62985020, "step": 2911, "time_per_iteration": 2.692662477493286 }, { "auxiliary_loss_clip": 0.01154744, "auxiliary_loss_mlp": 0.00777232, "balance_loss_clip": 1.05323184, "balance_loss_mlp": 1.00110698, "epoch": 0.1750789117691267, "flos": 17931203902080.0, "grad_norm": 3.030740090796483, "language_loss": 0.89883876, "learning_rate": 3.7832402587311764e-06, "loss": 0.91815847, "num_input_tokens_seen": 63001745, "step": 2912, "time_per_iteration": 2.600738763809204 }, { "auxiliary_loss_clip": 0.01146165, "auxiliary_loss_mlp": 0.01045616, "balance_loss_clip": 1.0538094, "balance_loss_mlp": 1.02655411, "epoch": 0.17513903502179468, "flos": 18259427404800.0, "grad_norm": 2.03479884577424, "language_loss": 0.72818935, "learning_rate": 3.783063882820439e-06, "loss": 0.75010711, "num_input_tokens_seen": 63019750, "step": 2913, "time_per_iteration": 2.623342275619507 }, { "auxiliary_loss_clip": 0.01140074, "auxiliary_loss_mlp": 0.01043928, "balance_loss_clip": 1.05781865, "balance_loss_mlp": 1.02557003, "epoch": 0.17519915827446264, "flos": 20704728562560.0, "grad_norm": 2.137073079496124, "language_loss": 0.6891731, "learning_rate": 3.782887439295741e-06, "loss": 0.71101314, "num_input_tokens_seen": 63039500, "step": 2914, "time_per_iteration": 2.7065770626068115 }, { "auxiliary_loss_clip": 0.01142434, "auxiliary_loss_mlp": 0.01045043, "balance_loss_clip": 1.05532789, "balance_loss_mlp": 1.02649403, "epoch": 0.1752592815271306, "flos": 20523415685760.0, "grad_norm": 2.051329837479214, "language_loss": 0.93125081, "learning_rate": 3.782710928163772e-06, "loss": 0.9531256, "num_input_tokens_seen": 63059785, "step": 2915, "time_per_iteration": 2.659029245376587 }, { "auxiliary_loss_clip": 0.01114731, "auxiliary_loss_mlp": 0.01040999, "balance_loss_clip": 1.04957223, "balance_loss_mlp": 1.02243853, "epoch": 0.1753194047797986, "flos": 21799178012160.0, "grad_norm": 1.604344576738792, "language_loss": 0.81092978, "learning_rate": 3.782534349431226e-06, "loss": 0.83248705, "num_input_tokens_seen": 63079385, "step": 2916, "time_per_iteration": 2.7099549770355225 }, { "auxiliary_loss_clip": 0.0114211, "auxiliary_loss_mlp": 0.01046221, "balance_loss_clip": 1.05090034, "balance_loss_mlp": 1.02780342, "epoch": 0.17537952803246656, "flos": 20668351063680.0, "grad_norm": 3.7582760939418716, "language_loss": 0.73829222, "learning_rate": 3.782357703104799e-06, "loss": 0.76017547, "num_input_tokens_seen": 63098970, "step": 2917, "time_per_iteration": 2.666717767715454 }, { "auxiliary_loss_clip": 0.01133449, "auxiliary_loss_mlp": 0.01047353, "balance_loss_clip": 1.05319786, "balance_loss_mlp": 1.02821994, "epoch": 0.17543965128513453, "flos": 23295072839040.0, "grad_norm": 1.813699779869167, "language_loss": 0.76739681, "learning_rate": 3.7821809891911897e-06, "loss": 0.78920484, "num_input_tokens_seen": 63118750, "step": 2918, "time_per_iteration": 2.647634744644165 }, { "auxiliary_loss_clip": 0.01093958, "auxiliary_loss_mlp": 0.01045643, "balance_loss_clip": 1.0476644, "balance_loss_mlp": 1.02425694, "epoch": 0.1754997745378025, "flos": 29095615416960.0, "grad_norm": 2.436739755969174, "language_loss": 0.73624814, "learning_rate": 3.782004207697098e-06, "loss": 0.75764406, "num_input_tokens_seen": 63136865, "step": 2919, "time_per_iteration": 2.7904632091522217 }, { "auxiliary_loss_clip": 0.0112465, "auxiliary_loss_mlp": 0.01046524, "balance_loss_clip": 1.04938293, "balance_loss_mlp": 1.02805829, "epoch": 0.17555989779047046, "flos": 30371844620160.0, "grad_norm": 2.5113730227003814, "language_loss": 0.74840331, "learning_rate": 3.781827358629228e-06, "loss": 0.77011508, "num_input_tokens_seen": 63158325, "step": 2920, "time_per_iteration": 2.727890968322754 }, { "auxiliary_loss_clip": 0.01117257, "auxiliary_loss_mlp": 0.01042893, "balance_loss_clip": 1.0462867, "balance_loss_mlp": 1.02371216, "epoch": 0.17562002104313842, "flos": 23287746464640.0, "grad_norm": 3.6617213109535536, "language_loss": 0.79731411, "learning_rate": 3.7816504419942873e-06, "loss": 0.81891561, "num_input_tokens_seen": 63173115, "step": 2921, "time_per_iteration": 2.753817558288574 }, { "auxiliary_loss_clip": 0.01121718, "auxiliary_loss_mlp": 0.01046234, "balance_loss_clip": 1.05232286, "balance_loss_mlp": 1.02679133, "epoch": 0.1756801442958064, "flos": 24790500789120.0, "grad_norm": 2.6301689129577546, "language_loss": 0.87826073, "learning_rate": 3.7814734577989823e-06, "loss": 0.89994025, "num_input_tokens_seen": 63192880, "step": 2922, "time_per_iteration": 2.7411837577819824 }, { "auxiliary_loss_clip": 0.01144004, "auxiliary_loss_mlp": 0.01047403, "balance_loss_clip": 1.05196273, "balance_loss_mlp": 1.02778149, "epoch": 0.17574026754847438, "flos": 25771651764480.0, "grad_norm": 4.4893841411537085, "language_loss": 0.62347209, "learning_rate": 3.7812964060500253e-06, "loss": 0.64538622, "num_input_tokens_seen": 63214395, "step": 2923, "time_per_iteration": 2.7666683197021484 }, { "auxiliary_loss_clip": 0.01134872, "auxiliary_loss_mlp": 0.01048692, "balance_loss_clip": 1.05887377, "balance_loss_mlp": 1.02847457, "epoch": 0.17580039080114235, "flos": 17456608477440.0, "grad_norm": 2.8552131957437914, "language_loss": 0.80392253, "learning_rate": 3.78111928675413e-06, "loss": 0.82575822, "num_input_tokens_seen": 63231020, "step": 2924, "time_per_iteration": 2.729403257369995 }, { "auxiliary_loss_clip": 0.01132783, "auxiliary_loss_mlp": 0.01051456, "balance_loss_clip": 1.05193377, "balance_loss_mlp": 1.03082108, "epoch": 0.1758605140538103, "flos": 14864648088960.0, "grad_norm": 5.080042666316876, "language_loss": 0.71374178, "learning_rate": 3.7809420999180126e-06, "loss": 0.73558426, "num_input_tokens_seen": 63246245, "step": 2925, "time_per_iteration": 2.9538233280181885 }, { "auxiliary_loss_clip": 0.01117196, "auxiliary_loss_mlp": 0.01045706, "balance_loss_clip": 1.05052948, "balance_loss_mlp": 1.02744341, "epoch": 0.17592063730647828, "flos": 23004268329600.0, "grad_norm": 1.6620026542608322, "language_loss": 0.71931666, "learning_rate": 3.7807648455483934e-06, "loss": 0.74094564, "num_input_tokens_seen": 63267790, "step": 2926, "time_per_iteration": 2.7738964557647705 }, { "auxiliary_loss_clip": 0.01105944, "auxiliary_loss_mlp": 0.01045732, "balance_loss_clip": 1.04915071, "balance_loss_mlp": 1.02253425, "epoch": 0.17598076055914624, "flos": 20741501111040.0, "grad_norm": 2.6318732447225837, "language_loss": 0.84724289, "learning_rate": 3.7805875236519918e-06, "loss": 0.86875963, "num_input_tokens_seen": 63286830, "step": 2927, "time_per_iteration": 2.704437494277954 }, { "auxiliary_loss_clip": 0.01100437, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.05039644, "balance_loss_mlp": 1.02887452, "epoch": 0.1760408838118142, "flos": 34092441227520.0, "grad_norm": 1.9547597089289632, "language_loss": 0.72147644, "learning_rate": 3.7804101342355336e-06, "loss": 0.74294758, "num_input_tokens_seen": 63308870, "step": 2928, "time_per_iteration": 2.793802261352539 }, { "auxiliary_loss_clip": 0.01120251, "auxiliary_loss_mlp": 0.01045623, "balance_loss_clip": 1.0516876, "balance_loss_mlp": 1.02679992, "epoch": 0.1761010070644822, "flos": 24168384207360.0, "grad_norm": 1.8474008440192304, "language_loss": 0.83097279, "learning_rate": 3.780232677305744e-06, "loss": 0.85263157, "num_input_tokens_seen": 63329005, "step": 2929, "time_per_iteration": 2.733339786529541 }, { "auxiliary_loss_clip": 0.01124127, "auxiliary_loss_mlp": 0.01042521, "balance_loss_clip": 1.04853475, "balance_loss_mlp": 1.02479422, "epoch": 0.17616113031715017, "flos": 26576697335040.0, "grad_norm": 2.4427170552109163, "language_loss": 0.79211783, "learning_rate": 3.7800551528693535e-06, "loss": 0.81378424, "num_input_tokens_seen": 63349390, "step": 2930, "time_per_iteration": 2.748080015182495 }, { "auxiliary_loss_clip": 0.01160654, "auxiliary_loss_mlp": 0.01047281, "balance_loss_clip": 1.05925918, "balance_loss_mlp": 1.02758813, "epoch": 0.17622125356981813, "flos": 25666685245440.0, "grad_norm": 2.504124366499191, "language_loss": 0.76502466, "learning_rate": 3.7798775609330927e-06, "loss": 0.78710401, "num_input_tokens_seen": 63368835, "step": 2931, "time_per_iteration": 2.6691603660583496 }, { "auxiliary_loss_clip": 0.01076453, "auxiliary_loss_mlp": 0.01043586, "balance_loss_clip": 1.04577017, "balance_loss_mlp": 1.02478647, "epoch": 0.1762813768224861, "flos": 16508530949760.0, "grad_norm": 2.941321746162514, "language_loss": 0.76070881, "learning_rate": 3.779699901503696e-06, "loss": 0.78190923, "num_input_tokens_seen": 63385220, "step": 2932, "time_per_iteration": 2.809630870819092 }, { "auxiliary_loss_clip": 0.01148627, "auxiliary_loss_mlp": 0.01043149, "balance_loss_clip": 1.05284405, "balance_loss_mlp": 1.0229789, "epoch": 0.17634150007515406, "flos": 11211850402560.0, "grad_norm": 5.168612276821382, "language_loss": 0.90027422, "learning_rate": 3.7795221745879016e-06, "loss": 0.92219198, "num_input_tokens_seen": 63400865, "step": 2933, "time_per_iteration": 2.6665337085723877 }, { "auxiliary_loss_clip": 0.01154114, "auxiliary_loss_mlp": 0.01055985, "balance_loss_clip": 1.05539656, "balance_loss_mlp": 1.03766203, "epoch": 0.17640162332782203, "flos": 23659925235840.0, "grad_norm": 2.009210784374188, "language_loss": 0.88323247, "learning_rate": 3.779344380192448e-06, "loss": 0.90533352, "num_input_tokens_seen": 63421390, "step": 2934, "time_per_iteration": 2.6649580001831055 }, { "auxiliary_loss_clip": 0.01128495, "auxiliary_loss_mlp": 0.01048067, "balance_loss_clip": 1.05581188, "balance_loss_mlp": 1.03028131, "epoch": 0.17646174658049, "flos": 53796984606720.0, "grad_norm": 1.6302121247923247, "language_loss": 0.70403945, "learning_rate": 3.779166518324077e-06, "loss": 0.72580504, "num_input_tokens_seen": 63444715, "step": 2935, "time_per_iteration": 3.006019115447998 }, { "auxiliary_loss_clip": 0.01126189, "auxiliary_loss_mlp": 0.01040034, "balance_loss_clip": 1.05360174, "balance_loss_mlp": 1.02135396, "epoch": 0.17652186983315798, "flos": 24243868638720.0, "grad_norm": 2.5931578566124807, "language_loss": 0.69721985, "learning_rate": 3.7789885889895325e-06, "loss": 0.71888208, "num_input_tokens_seen": 63465525, "step": 2936, "time_per_iteration": 2.7517428398132324 }, { "auxiliary_loss_clip": 0.01105644, "auxiliary_loss_mlp": 0.01045896, "balance_loss_clip": 1.05023837, "balance_loss_mlp": 1.02737129, "epoch": 0.17658199308582595, "flos": 27454282421760.0, "grad_norm": 1.9170676229980566, "language_loss": 0.71288073, "learning_rate": 3.7788105921955634e-06, "loss": 0.73439616, "num_input_tokens_seen": 63485815, "step": 2937, "time_per_iteration": 2.837181329727173 }, { "auxiliary_loss_clip": 0.01141008, "auxiliary_loss_mlp": 0.01046843, "balance_loss_clip": 1.05945122, "balance_loss_mlp": 1.02674472, "epoch": 0.17664211633849392, "flos": 22418672901120.0, "grad_norm": 2.267148270780071, "language_loss": 0.75439745, "learning_rate": 3.7786325279489184e-06, "loss": 0.77627593, "num_input_tokens_seen": 63503905, "step": 2938, "time_per_iteration": 2.883162021636963 }, { "auxiliary_loss_clip": 0.01147345, "auxiliary_loss_mlp": 0.01043976, "balance_loss_clip": 1.05576169, "balance_loss_mlp": 1.02553487, "epoch": 0.17670223959116188, "flos": 24715124098560.0, "grad_norm": 2.921726967662053, "language_loss": 0.71015209, "learning_rate": 3.7784543962563495e-06, "loss": 0.73206532, "num_input_tokens_seen": 63521985, "step": 2939, "time_per_iteration": 2.6938419342041016 }, { "auxiliary_loss_clip": 0.01160437, "auxiliary_loss_mlp": 0.01046921, "balance_loss_clip": 1.05818558, "balance_loss_mlp": 1.02794337, "epoch": 0.17676236284382985, "flos": 22527051212160.0, "grad_norm": 3.114901170192376, "language_loss": 0.73513985, "learning_rate": 3.7782761971246115e-06, "loss": 0.75721341, "num_input_tokens_seen": 63539830, "step": 2940, "time_per_iteration": 4.145469665527344 }, { "auxiliary_loss_clip": 0.0112582, "auxiliary_loss_mlp": 0.01046611, "balance_loss_clip": 1.05631542, "balance_loss_mlp": 1.02731109, "epoch": 0.1768224860964978, "flos": 12385160161920.0, "grad_norm": 3.071469776016301, "language_loss": 0.85375023, "learning_rate": 3.7780979305604616e-06, "loss": 0.87547457, "num_input_tokens_seen": 63555495, "step": 2941, "time_per_iteration": 4.279599666595459 }, { "auxiliary_loss_clip": 0.01161068, "auxiliary_loss_mlp": 0.01045254, "balance_loss_clip": 1.05717027, "balance_loss_mlp": 1.0257628, "epoch": 0.1768826093491658, "flos": 24353360271360.0, "grad_norm": 2.434766510066968, "language_loss": 0.76885259, "learning_rate": 3.7779195965706607e-06, "loss": 0.79091585, "num_input_tokens_seen": 63575290, "step": 2942, "time_per_iteration": 4.2280871868133545 }, { "auxiliary_loss_clip": 0.01106234, "auxiliary_loss_mlp": 0.00780676, "balance_loss_clip": 1.04992843, "balance_loss_mlp": 1.00087166, "epoch": 0.17694273260183377, "flos": 23587062497280.0, "grad_norm": 3.301743041114179, "language_loss": 0.8024286, "learning_rate": 3.77774119516197e-06, "loss": 0.82129776, "num_input_tokens_seen": 63594670, "step": 2943, "time_per_iteration": 2.8921029567718506 }, { "auxiliary_loss_clip": 0.01132848, "auxiliary_loss_mlp": 0.01052225, "balance_loss_clip": 1.05352235, "balance_loss_mlp": 1.03124392, "epoch": 0.17700285585450173, "flos": 26760991040640.0, "grad_norm": 5.7613375603973465, "language_loss": 0.80809408, "learning_rate": 3.777562726341155e-06, "loss": 0.82994485, "num_input_tokens_seen": 63614780, "step": 2944, "time_per_iteration": 2.692831039428711 }, { "auxiliary_loss_clip": 0.01161854, "auxiliary_loss_mlp": 0.01056825, "balance_loss_clip": 1.05807233, "balance_loss_mlp": 1.03796625, "epoch": 0.1770629791071697, "flos": 42776323320960.0, "grad_norm": 2.4257754996125227, "language_loss": 0.73812854, "learning_rate": 3.7773841901149835e-06, "loss": 0.7603153, "num_input_tokens_seen": 63637190, "step": 2945, "time_per_iteration": 2.782910108566284 }, { "auxiliary_loss_clip": 0.011481, "auxiliary_loss_mlp": 0.01047361, "balance_loss_clip": 1.05756998, "balance_loss_mlp": 1.02862108, "epoch": 0.17712310235983766, "flos": 17345572560000.0, "grad_norm": 2.8106797532110637, "language_loss": 0.7793628, "learning_rate": 3.7772055864902256e-06, "loss": 0.80131739, "num_input_tokens_seen": 63652140, "step": 2946, "time_per_iteration": 4.278741121292114 }, { "auxiliary_loss_clip": 0.01109059, "auxiliary_loss_mlp": 0.01052842, "balance_loss_clip": 1.04997015, "balance_loss_mlp": 1.03341079, "epoch": 0.17718322561250563, "flos": 23878477537920.0, "grad_norm": 2.172386857191393, "language_loss": 0.76068008, "learning_rate": 3.7770269154736535e-06, "loss": 0.7822991, "num_input_tokens_seen": 63671700, "step": 2947, "time_per_iteration": 2.7949914932250977 }, { "auxiliary_loss_clip": 0.0114934, "auxiliary_loss_mlp": 0.01044342, "balance_loss_clip": 1.05480659, "balance_loss_mlp": 1.025388, "epoch": 0.1772433488651736, "flos": 36466352104320.0, "grad_norm": 2.6793588646204745, "language_loss": 0.72557831, "learning_rate": 3.7768481770720424e-06, "loss": 0.74751514, "num_input_tokens_seen": 63691685, "step": 2948, "time_per_iteration": 2.901662826538086 }, { "auxiliary_loss_clip": 0.01151572, "auxiliary_loss_mlp": 0.01050692, "balance_loss_clip": 1.05921662, "balance_loss_mlp": 1.03236949, "epoch": 0.1773034721178416, "flos": 26684716510080.0, "grad_norm": 1.8296543316983853, "language_loss": 0.81782824, "learning_rate": 3.776669371292171e-06, "loss": 0.8398509, "num_input_tokens_seen": 63711720, "step": 2949, "time_per_iteration": 2.7284891605377197 }, { "auxiliary_loss_clip": 0.01080853, "auxiliary_loss_mlp": 0.0100651, "balance_loss_clip": 1.04975748, "balance_loss_mlp": 1.00226629, "epoch": 0.17736359537050955, "flos": 57117467617920.0, "grad_norm": 0.768126622018234, "language_loss": 0.64989161, "learning_rate": 3.7764904981408186e-06, "loss": 0.67076528, "num_input_tokens_seen": 63776280, "step": 2950, "time_per_iteration": 3.2761552333831787 }, { "auxiliary_loss_clip": 0.01121454, "auxiliary_loss_mlp": 0.01045861, "balance_loss_clip": 1.05373287, "balance_loss_mlp": 1.02743077, "epoch": 0.17742371862317752, "flos": 27198203385600.0, "grad_norm": 2.9882590699755927, "language_loss": 0.83619881, "learning_rate": 3.7763115576247686e-06, "loss": 0.85787189, "num_input_tokens_seen": 63797535, "step": 2951, "time_per_iteration": 2.7637627124786377 }, { "auxiliary_loss_clip": 0.01125929, "auxiliary_loss_mlp": 0.01046039, "balance_loss_clip": 1.05109882, "balance_loss_mlp": 1.02682269, "epoch": 0.17748384187584548, "flos": 20959694277120.0, "grad_norm": 2.3133151959471796, "language_loss": 0.80395055, "learning_rate": 3.776132549750806e-06, "loss": 0.82567012, "num_input_tokens_seen": 63817045, "step": 2952, "time_per_iteration": 2.7605957984924316 }, { "auxiliary_loss_clip": 0.01162679, "auxiliary_loss_mlp": 0.01044862, "balance_loss_clip": 1.05858529, "balance_loss_mlp": 1.02513337, "epoch": 0.17754396512851345, "flos": 25009986844800.0, "grad_norm": 2.8185319653472116, "language_loss": 0.79273909, "learning_rate": 3.7759534745257194e-06, "loss": 0.81481451, "num_input_tokens_seen": 63837665, "step": 2953, "time_per_iteration": 2.798912525177002 }, { "auxiliary_loss_clip": 0.0112399, "auxiliary_loss_mlp": 0.01043314, "balance_loss_clip": 1.05482125, "balance_loss_mlp": 1.02470589, "epoch": 0.1776040883811814, "flos": 32051566275840.0, "grad_norm": 2.017710353628998, "language_loss": 0.87963271, "learning_rate": 3.7757743319562994e-06, "loss": 0.90130568, "num_input_tokens_seen": 63858455, "step": 2954, "time_per_iteration": 2.838931083679199 }, { "auxiliary_loss_clip": 0.01144028, "auxiliary_loss_mlp": 0.01052958, "balance_loss_clip": 1.06043494, "balance_loss_mlp": 1.03296697, "epoch": 0.17766421163384938, "flos": 21574125348480.0, "grad_norm": 1.9130853947826985, "language_loss": 0.85313326, "learning_rate": 3.7755951220493386e-06, "loss": 0.87510312, "num_input_tokens_seen": 63876935, "step": 2955, "time_per_iteration": 2.7965714931488037 }, { "auxiliary_loss_clip": 0.01127677, "auxiliary_loss_mlp": 0.01047004, "balance_loss_clip": 1.05093336, "balance_loss_mlp": 1.02660692, "epoch": 0.17772433488651737, "flos": 22419319345920.0, "grad_norm": 18.24238703278013, "language_loss": 0.71152055, "learning_rate": 3.7754158448116327e-06, "loss": 0.73326737, "num_input_tokens_seen": 63896815, "step": 2956, "time_per_iteration": 2.8358442783355713 }, { "auxiliary_loss_clip": 0.01150063, "auxiliary_loss_mlp": 0.010506, "balance_loss_clip": 1.05813813, "balance_loss_mlp": 1.03156281, "epoch": 0.17778445813918534, "flos": 25629445820160.0, "grad_norm": 2.981126112172262, "language_loss": 0.82881534, "learning_rate": 3.7752365002499795e-06, "loss": 0.85082197, "num_input_tokens_seen": 63916140, "step": 2957, "time_per_iteration": 2.7034976482391357 }, { "auxiliary_loss_clip": 0.01100452, "auxiliary_loss_mlp": 0.01047239, "balance_loss_clip": 1.04976833, "balance_loss_mlp": 1.02789164, "epoch": 0.1778445813918533, "flos": 25628871202560.0, "grad_norm": 2.7180995933425622, "language_loss": 0.75164193, "learning_rate": 3.7750570883711807e-06, "loss": 0.77311885, "num_input_tokens_seen": 63935220, "step": 2958, "time_per_iteration": 2.8312718868255615 }, { "auxiliary_loss_clip": 0.01146025, "auxiliary_loss_mlp": 0.01043359, "balance_loss_clip": 1.06117964, "balance_loss_mlp": 1.02502513, "epoch": 0.17790470464452127, "flos": 22345522853760.0, "grad_norm": 9.439636088267013, "language_loss": 0.80363399, "learning_rate": 3.7748776091820397e-06, "loss": 0.82552785, "num_input_tokens_seen": 63954550, "step": 2959, "time_per_iteration": 2.722102642059326 }, { "auxiliary_loss_clip": 0.01164621, "auxiliary_loss_mlp": 0.01049069, "balance_loss_clip": 1.05812871, "balance_loss_mlp": 1.02938771, "epoch": 0.17796482789718923, "flos": 18765875214720.0, "grad_norm": 2.62580469975692, "language_loss": 0.51511085, "learning_rate": 3.774698062689362e-06, "loss": 0.53724772, "num_input_tokens_seen": 63972425, "step": 2960, "time_per_iteration": 2.6222047805786133 }, { "auxiliary_loss_clip": 0.01111843, "auxiliary_loss_mlp": 0.01052801, "balance_loss_clip": 1.05275989, "balance_loss_mlp": 1.03228474, "epoch": 0.1780249511498572, "flos": 23440941970560.0, "grad_norm": 1.7626913000215665, "language_loss": 0.88908094, "learning_rate": 3.7745184488999548e-06, "loss": 0.91072738, "num_input_tokens_seen": 63992165, "step": 2961, "time_per_iteration": 2.8088786602020264 }, { "auxiliary_loss_clip": 0.01116231, "auxiliary_loss_mlp": 0.01054867, "balance_loss_clip": 1.05181062, "balance_loss_mlp": 1.03385067, "epoch": 0.1780850744025252, "flos": 23367468700800.0, "grad_norm": 1.716412227369414, "language_loss": 0.79170465, "learning_rate": 3.774338767820631e-06, "loss": 0.81341565, "num_input_tokens_seen": 64013470, "step": 2962, "time_per_iteration": 2.7546913623809814 }, { "auxiliary_loss_clip": 0.01145526, "auxiliary_loss_mlp": 0.01052794, "balance_loss_clip": 1.05649889, "balance_loss_mlp": 1.03104997, "epoch": 0.17814519765519315, "flos": 13771994319360.0, "grad_norm": 2.3241756501763446, "language_loss": 0.74910223, "learning_rate": 3.774159019458203e-06, "loss": 0.77108544, "num_input_tokens_seen": 64030975, "step": 2963, "time_per_iteration": 2.680356979370117 }, { "auxiliary_loss_clip": 0.01140656, "auxiliary_loss_mlp": 0.01043225, "balance_loss_clip": 1.05769885, "balance_loss_mlp": 1.02347231, "epoch": 0.17820532090786112, "flos": 21976396738560.0, "grad_norm": 1.747536927551571, "language_loss": 0.78837025, "learning_rate": 3.7739792038194877e-06, "loss": 0.81020904, "num_input_tokens_seen": 64050075, "step": 2964, "time_per_iteration": 2.748398780822754 }, { "auxiliary_loss_clip": 0.01151685, "auxiliary_loss_mlp": 0.00776982, "balance_loss_clip": 1.05950594, "balance_loss_mlp": 1.00098181, "epoch": 0.17826544416052909, "flos": 24790752184320.0, "grad_norm": 3.046027397796258, "language_loss": 0.81160808, "learning_rate": 3.7737993209113027e-06, "loss": 0.83089471, "num_input_tokens_seen": 64071920, "step": 2965, "time_per_iteration": 2.8090012073516846 }, { "auxiliary_loss_clip": 0.01151658, "auxiliary_loss_mlp": 0.01047086, "balance_loss_clip": 1.06002402, "balance_loss_mlp": 1.02916884, "epoch": 0.17832556741319705, "flos": 13879582531200.0, "grad_norm": 2.554359630612449, "language_loss": 0.95307338, "learning_rate": 3.7736193707404698e-06, "loss": 0.97506082, "num_input_tokens_seen": 64086835, "step": 2966, "time_per_iteration": 2.7159550189971924 }, { "auxiliary_loss_clip": 0.01112928, "auxiliary_loss_mlp": 0.00777395, "balance_loss_clip": 1.05336046, "balance_loss_mlp": 1.00083637, "epoch": 0.17838569066586502, "flos": 36641703323520.0, "grad_norm": 7.5683867487642065, "language_loss": 0.72833109, "learning_rate": 3.7734393533138127e-06, "loss": 0.74723434, "num_input_tokens_seen": 64107360, "step": 2967, "time_per_iteration": 2.9540669918060303 }, { "auxiliary_loss_clip": 0.01129124, "auxiliary_loss_mlp": 0.01046817, "balance_loss_clip": 1.05574143, "balance_loss_mlp": 1.02775562, "epoch": 0.17844581391853298, "flos": 18727271072640.0, "grad_norm": 2.1617023205672523, "language_loss": 0.76897681, "learning_rate": 3.773259268638157e-06, "loss": 0.7907362, "num_input_tokens_seen": 64124690, "step": 2968, "time_per_iteration": 2.752717971801758 }, { "auxiliary_loss_clip": 0.01085006, "auxiliary_loss_mlp": 0.01044958, "balance_loss_clip": 1.04640651, "balance_loss_mlp": 1.02559829, "epoch": 0.17850593717120097, "flos": 27378259286400.0, "grad_norm": 2.039560504387258, "language_loss": 0.75839806, "learning_rate": 3.7730791167203333e-06, "loss": 0.77969772, "num_input_tokens_seen": 64146315, "step": 2969, "time_per_iteration": 2.9161994457244873 }, { "auxiliary_loss_clip": 0.01075271, "auxiliary_loss_mlp": 0.01013071, "balance_loss_clip": 1.06177902, "balance_loss_mlp": 1.00932813, "epoch": 0.17856606042386894, "flos": 66996025084800.0, "grad_norm": 0.8520394227890811, "language_loss": 0.69012916, "learning_rate": 3.772898897567171e-06, "loss": 0.7110126, "num_input_tokens_seen": 64210875, "step": 2970, "time_per_iteration": 3.3269262313842773 }, { "auxiliary_loss_clip": 0.011313, "auxiliary_loss_mlp": 0.01044166, "balance_loss_clip": 1.05561864, "balance_loss_mlp": 1.02493763, "epoch": 0.1786261836765369, "flos": 36977001805440.0, "grad_norm": 1.9951166568015506, "language_loss": 0.67617297, "learning_rate": 3.772718611185505e-06, "loss": 0.69792765, "num_input_tokens_seen": 64230740, "step": 2971, "time_per_iteration": 2.8691961765289307 }, { "auxiliary_loss_clip": 0.01110831, "auxiliary_loss_mlp": 0.01052779, "balance_loss_clip": 1.05309939, "balance_loss_mlp": 1.03266823, "epoch": 0.17868630692920487, "flos": 24825441744000.0, "grad_norm": 1.5664358375440484, "language_loss": 0.8971802, "learning_rate": 3.7725382575821717e-06, "loss": 0.91881633, "num_input_tokens_seen": 64252300, "step": 2972, "time_per_iteration": 2.893923759460449 }, { "auxiliary_loss_clip": 0.01124705, "auxiliary_loss_mlp": 0.01055871, "balance_loss_clip": 1.05635929, "balance_loss_mlp": 1.03466403, "epoch": 0.17874643018187283, "flos": 16981977139200.0, "grad_norm": 2.4611679901229153, "language_loss": 0.88593906, "learning_rate": 3.77235783676401e-06, "loss": 0.90774482, "num_input_tokens_seen": 64270105, "step": 2973, "time_per_iteration": 2.7340333461761475 }, { "auxiliary_loss_clip": 0.01164127, "auxiliary_loss_mlp": 0.01047073, "balance_loss_clip": 1.06285155, "balance_loss_mlp": 1.0283215, "epoch": 0.1788065534345408, "flos": 21032233793280.0, "grad_norm": 3.4039298885336557, "language_loss": 0.7668556, "learning_rate": 3.7721773487378615e-06, "loss": 0.78896761, "num_input_tokens_seen": 64287250, "step": 2974, "time_per_iteration": 2.632495403289795 }, { "auxiliary_loss_clip": 0.0114187, "auxiliary_loss_mlp": 0.01053, "balance_loss_clip": 1.06101942, "balance_loss_mlp": 1.03390288, "epoch": 0.17886667668720876, "flos": 23987717775360.0, "grad_norm": 2.484949778027245, "language_loss": 0.74701655, "learning_rate": 3.7719967935105705e-06, "loss": 0.76896524, "num_input_tokens_seen": 64307140, "step": 2975, "time_per_iteration": 2.704012870788574 }, { "auxiliary_loss_clip": 0.01149026, "auxiliary_loss_mlp": 0.01048788, "balance_loss_clip": 1.05678535, "balance_loss_mlp": 1.03004813, "epoch": 0.17892679993987676, "flos": 25739476156800.0, "grad_norm": 1.518747487377626, "language_loss": 0.73032069, "learning_rate": 3.7718161710889833e-06, "loss": 0.75229883, "num_input_tokens_seen": 64328760, "step": 2976, "time_per_iteration": 2.7357017993927 }, { "auxiliary_loss_clip": 0.01150398, "auxiliary_loss_mlp": 0.01038685, "balance_loss_clip": 1.06239033, "balance_loss_mlp": 1.0229373, "epoch": 0.17898692319254472, "flos": 25699686865920.0, "grad_norm": 1.4579507247258654, "language_loss": 0.770594, "learning_rate": 3.7716354814799495e-06, "loss": 0.79248488, "num_input_tokens_seen": 64348800, "step": 2977, "time_per_iteration": 2.727318286895752 }, { "auxiliary_loss_clip": 0.01131521, "auxiliary_loss_mlp": 0.01045834, "balance_loss_clip": 1.06618452, "balance_loss_mlp": 1.02841735, "epoch": 0.1790470464452127, "flos": 19317786664320.0, "grad_norm": 2.7286854986191282, "language_loss": 0.80235189, "learning_rate": 3.7714547246903203e-06, "loss": 0.82412547, "num_input_tokens_seen": 64367955, "step": 2978, "time_per_iteration": 2.8178791999816895 }, { "auxiliary_loss_clip": 0.0114307, "auxiliary_loss_mlp": 0.01052978, "balance_loss_clip": 1.05818772, "balance_loss_mlp": 1.03330874, "epoch": 0.17910716969788065, "flos": 30044267562240.0, "grad_norm": 1.4967765935497133, "language_loss": 0.76192784, "learning_rate": 3.7712739007269508e-06, "loss": 0.7838884, "num_input_tokens_seen": 64389805, "step": 2979, "time_per_iteration": 4.241487741470337 }, { "auxiliary_loss_clip": 0.01122958, "auxiliary_loss_mlp": 0.0104457, "balance_loss_clip": 1.0590893, "balance_loss_mlp": 1.02660525, "epoch": 0.17916729295054862, "flos": 19427709260160.0, "grad_norm": 1.9491816848203256, "language_loss": 0.68945503, "learning_rate": 3.7710930095966976e-06, "loss": 0.71113026, "num_input_tokens_seen": 64408220, "step": 2980, "time_per_iteration": 2.6817352771759033 }, { "auxiliary_loss_clip": 0.01152986, "auxiliary_loss_mlp": 0.0104519, "balance_loss_clip": 1.0588038, "balance_loss_mlp": 1.02497244, "epoch": 0.17922741620321658, "flos": 14611549881600.0, "grad_norm": 1.9134992191513662, "language_loss": 0.70793843, "learning_rate": 3.7709120513064196e-06, "loss": 0.72992027, "num_input_tokens_seen": 64426380, "step": 2981, "time_per_iteration": 4.310532331466675 }, { "auxiliary_loss_clip": 0.01137747, "auxiliary_loss_mlp": 0.01056086, "balance_loss_clip": 1.06083858, "balance_loss_mlp": 1.03686976, "epoch": 0.17928753945588458, "flos": 17165301177600.0, "grad_norm": 2.529665562311581, "language_loss": 0.8190546, "learning_rate": 3.7707310258629796e-06, "loss": 0.84099293, "num_input_tokens_seen": 64444355, "step": 2982, "time_per_iteration": 2.710726261138916 }, { "auxiliary_loss_clip": 0.01162978, "auxiliary_loss_mlp": 0.01041014, "balance_loss_clip": 1.06181359, "balance_loss_mlp": 1.02306128, "epoch": 0.17934766270855254, "flos": 31395622060800.0, "grad_norm": 1.6440716861921114, "language_loss": 0.83123535, "learning_rate": 3.7705499332732413e-06, "loss": 0.85327524, "num_input_tokens_seen": 64467800, "step": 2983, "time_per_iteration": 2.700378656387329 }, { "auxiliary_loss_clip": 0.01153001, "auxiliary_loss_mlp": 0.01048341, "balance_loss_clip": 1.05694914, "balance_loss_mlp": 1.02932739, "epoch": 0.1794077859612205, "flos": 20814184281600.0, "grad_norm": 1.6703280507743268, "language_loss": 0.85149562, "learning_rate": 3.7703687735440718e-06, "loss": 0.87350899, "num_input_tokens_seen": 64487230, "step": 2984, "time_per_iteration": 2.6529407501220703 }, { "auxiliary_loss_clip": 0.01126981, "auxiliary_loss_mlp": 0.01043442, "balance_loss_clip": 1.05520201, "balance_loss_mlp": 1.02424896, "epoch": 0.17946790921388847, "flos": 28986447006720.0, "grad_norm": 2.4609160562432053, "language_loss": 0.8935222, "learning_rate": 3.7701875466823416e-06, "loss": 0.9152264, "num_input_tokens_seen": 64509165, "step": 2985, "time_per_iteration": 4.528426170349121 }, { "auxiliary_loss_clip": 0.01160091, "auxiliary_loss_mlp": 0.01040749, "balance_loss_clip": 1.06142831, "balance_loss_mlp": 1.02434587, "epoch": 0.17952803246655644, "flos": 20737406960640.0, "grad_norm": 2.095497349072142, "language_loss": 0.69538593, "learning_rate": 3.770006252694922e-06, "loss": 0.71739429, "num_input_tokens_seen": 64527940, "step": 2986, "time_per_iteration": 2.6890172958374023 }, { "auxiliary_loss_clip": 0.01158556, "auxiliary_loss_mlp": 0.00776, "balance_loss_clip": 1.05752599, "balance_loss_mlp": 1.00081134, "epoch": 0.1795881557192244, "flos": 28255988027520.0, "grad_norm": 2.4599229747435123, "language_loss": 0.77855188, "learning_rate": 3.769824891588688e-06, "loss": 0.79789746, "num_input_tokens_seen": 64545230, "step": 2987, "time_per_iteration": 2.650761842727661 }, { "auxiliary_loss_clip": 0.0116216, "auxiliary_loss_mlp": 0.01043775, "balance_loss_clip": 1.05775642, "balance_loss_mlp": 1.02441502, "epoch": 0.17964827897189237, "flos": 18552027594240.0, "grad_norm": 2.0190394876224467, "language_loss": 0.77958816, "learning_rate": 3.7696434633705164e-06, "loss": 0.80164748, "num_input_tokens_seen": 64563820, "step": 2988, "time_per_iteration": 2.6151437759399414 }, { "auxiliary_loss_clip": 0.01059513, "auxiliary_loss_mlp": 0.00756906, "balance_loss_clip": 1.07071137, "balance_loss_mlp": 1.00131369, "epoch": 0.17970840222456036, "flos": 58165088711040.0, "grad_norm": 0.7650122273387262, "language_loss": 0.62709254, "learning_rate": 3.7694619680472875e-06, "loss": 0.64525676, "num_input_tokens_seen": 64621315, "step": 2989, "time_per_iteration": 3.1990275382995605 }, { "auxiliary_loss_clip": 0.01137168, "auxiliary_loss_mlp": 0.01038826, "balance_loss_clip": 1.05553865, "balance_loss_mlp": 1.02128983, "epoch": 0.17976852547722832, "flos": 20300805146880.0, "grad_norm": 2.3566032567209483, "language_loss": 0.71070904, "learning_rate": 3.7692804056258837e-06, "loss": 0.73246896, "num_input_tokens_seen": 64639885, "step": 2990, "time_per_iteration": 2.7275335788726807 }, { "auxiliary_loss_clip": 0.01135847, "auxiliary_loss_mlp": 0.01044966, "balance_loss_clip": 1.05398035, "balance_loss_mlp": 1.02639365, "epoch": 0.1798286487298963, "flos": 39669367685760.0, "grad_norm": 1.8035266350414116, "language_loss": 0.68888462, "learning_rate": 3.7690987761131893e-06, "loss": 0.7106927, "num_input_tokens_seen": 64661220, "step": 2991, "time_per_iteration": 2.8237311840057373 }, { "auxiliary_loss_clip": 0.01104375, "auxiliary_loss_mlp": 0.01046061, "balance_loss_clip": 1.05156851, "balance_loss_mlp": 1.02663028, "epoch": 0.17988877198256426, "flos": 25520313323520.0, "grad_norm": 1.6063564491400402, "language_loss": 0.82933879, "learning_rate": 3.7689170795160924e-06, "loss": 0.85084313, "num_input_tokens_seen": 64682530, "step": 2992, "time_per_iteration": 2.8303778171539307 }, { "auxiliary_loss_clip": 0.01140805, "auxiliary_loss_mlp": 0.01035603, "balance_loss_clip": 1.05302262, "balance_loss_mlp": 1.0187583, "epoch": 0.17994889523523222, "flos": 18807496099200.0, "grad_norm": 2.076285453641059, "language_loss": 0.82228035, "learning_rate": 3.7687353158414822e-06, "loss": 0.84404445, "num_input_tokens_seen": 64701025, "step": 2993, "time_per_iteration": 2.710369110107422 }, { "auxiliary_loss_clip": 0.01135151, "auxiliary_loss_mlp": 0.01040493, "balance_loss_clip": 1.05135202, "balance_loss_mlp": 1.02236176, "epoch": 0.18000901848790019, "flos": 21104450087040.0, "grad_norm": 1.7027458997386926, "language_loss": 0.78129464, "learning_rate": 3.7685534850962517e-06, "loss": 0.80305111, "num_input_tokens_seen": 64719570, "step": 2994, "time_per_iteration": 2.6666738986968994 }, { "auxiliary_loss_clip": 0.01158877, "auxiliary_loss_mlp": 0.01045455, "balance_loss_clip": 1.05657315, "balance_loss_mlp": 1.02819359, "epoch": 0.18006914174056818, "flos": 19646441130240.0, "grad_norm": 2.4198973911698434, "language_loss": 0.81139499, "learning_rate": 3.768371587287296e-06, "loss": 0.83343828, "num_input_tokens_seen": 64738110, "step": 2995, "time_per_iteration": 2.699521541595459 }, { "auxiliary_loss_clip": 0.01142902, "auxiliary_loss_mlp": 0.01047606, "balance_loss_clip": 1.05350447, "balance_loss_mlp": 1.0310601, "epoch": 0.18012926499323614, "flos": 19499889640320.0, "grad_norm": 1.8607496799697536, "language_loss": 0.84162772, "learning_rate": 3.768189622421512e-06, "loss": 0.86353278, "num_input_tokens_seen": 64756345, "step": 2996, "time_per_iteration": 2.696723461151123 }, { "auxiliary_loss_clip": 0.01127214, "auxiliary_loss_mlp": 0.01039953, "balance_loss_clip": 1.06094205, "balance_loss_mlp": 1.02273917, "epoch": 0.1801893882459041, "flos": 19464553635840.0, "grad_norm": 2.1291201116421283, "language_loss": 0.88189137, "learning_rate": 3.7680075905058006e-06, "loss": 0.90356302, "num_input_tokens_seen": 64776375, "step": 2997, "time_per_iteration": 2.785522699356079 }, { "auxiliary_loss_clip": 0.01134376, "auxiliary_loss_mlp": 0.01045962, "balance_loss_clip": 1.04949927, "balance_loss_mlp": 1.02753246, "epoch": 0.18024951149857207, "flos": 26870590414080.0, "grad_norm": 1.7579499924576911, "language_loss": 0.85068727, "learning_rate": 3.7678254915470643e-06, "loss": 0.87249064, "num_input_tokens_seen": 64796210, "step": 2998, "time_per_iteration": 2.6912384033203125 }, { "auxiliary_loss_clip": 0.01159537, "auxiliary_loss_mlp": 0.01044427, "balance_loss_clip": 1.06019807, "balance_loss_mlp": 1.02641416, "epoch": 0.18030963475124004, "flos": 30226621933440.0, "grad_norm": 1.8075624565441775, "language_loss": 0.84176779, "learning_rate": 3.7676433255522084e-06, "loss": 0.86380744, "num_input_tokens_seen": 64818590, "step": 2999, "time_per_iteration": 2.722447395324707 }, { "auxiliary_loss_clip": 0.01143605, "auxiliary_loss_mlp": 0.01047321, "balance_loss_clip": 1.05324686, "balance_loss_mlp": 1.02870023, "epoch": 0.180369758003908, "flos": 22307493329280.0, "grad_norm": 1.8789697336390492, "language_loss": 0.75206578, "learning_rate": 3.76746109252814e-06, "loss": 0.77397501, "num_input_tokens_seen": 64838350, "step": 3000, "time_per_iteration": 2.669875144958496 }, { "auxiliary_loss_clip": 0.01130052, "auxiliary_loss_mlp": 0.00775745, "balance_loss_clip": 1.0526886, "balance_loss_mlp": 1.00060582, "epoch": 0.18042988125657597, "flos": 23732033788800.0, "grad_norm": 2.1714361871851704, "language_loss": 0.71088028, "learning_rate": 3.76727879248177e-06, "loss": 0.72993821, "num_input_tokens_seen": 64858065, "step": 3001, "time_per_iteration": 2.7207603454589844 }, { "auxiliary_loss_clip": 0.01150091, "auxiliary_loss_mlp": 0.01044695, "balance_loss_clip": 1.05701649, "balance_loss_mlp": 1.02605033, "epoch": 0.18049000450924396, "flos": 24093582134400.0, "grad_norm": 2.218812983953599, "language_loss": 0.8849982, "learning_rate": 3.767096425420011e-06, "loss": 0.90694606, "num_input_tokens_seen": 64877305, "step": 3002, "time_per_iteration": 2.6577625274658203 }, { "auxiliary_loss_clip": 0.01157827, "auxiliary_loss_mlp": 0.01048268, "balance_loss_clip": 1.05624068, "balance_loss_mlp": 1.03076851, "epoch": 0.18055012776191193, "flos": 22163168482560.0, "grad_norm": 1.6287780165264572, "language_loss": 0.80328667, "learning_rate": 3.7669139913497788e-06, "loss": 0.8253476, "num_input_tokens_seen": 64896955, "step": 3003, "time_per_iteration": 2.6274783611297607 }, { "auxiliary_loss_clip": 0.01158367, "auxiliary_loss_mlp": 0.01043654, "balance_loss_clip": 1.05622995, "balance_loss_mlp": 1.02596307, "epoch": 0.1806102510145799, "flos": 28913512440960.0, "grad_norm": 2.3308952017896956, "language_loss": 0.67250973, "learning_rate": 3.7667314902779907e-06, "loss": 0.69452989, "num_input_tokens_seen": 64917080, "step": 3004, "time_per_iteration": 2.6652631759643555 }, { "auxiliary_loss_clip": 0.01147517, "auxiliary_loss_mlp": 0.01054518, "balance_loss_clip": 1.05606318, "balance_loss_mlp": 1.03528929, "epoch": 0.18067037426724786, "flos": 19025689265280.0, "grad_norm": 2.592432277036083, "language_loss": 0.85111535, "learning_rate": 3.7665489222115677e-06, "loss": 0.87313569, "num_input_tokens_seen": 64935215, "step": 3005, "time_per_iteration": 2.654977560043335 }, { "auxiliary_loss_clip": 0.0114499, "auxiliary_loss_mlp": 0.01041993, "balance_loss_clip": 1.05690646, "balance_loss_mlp": 1.02489829, "epoch": 0.18073049751991582, "flos": 27453635976960.0, "grad_norm": 1.5217876402754629, "language_loss": 0.83215338, "learning_rate": 3.766366287157432e-06, "loss": 0.85402322, "num_input_tokens_seen": 64956275, "step": 3006, "time_per_iteration": 2.7118306159973145 }, { "auxiliary_loss_clip": 0.01127168, "auxiliary_loss_mlp": 0.01050084, "balance_loss_clip": 1.05063033, "balance_loss_mlp": 1.03105807, "epoch": 0.1807906207725838, "flos": 28729039167360.0, "grad_norm": 1.6327495611050657, "language_loss": 0.77377248, "learning_rate": 3.7661835851225103e-06, "loss": 0.79554498, "num_input_tokens_seen": 64979390, "step": 3007, "time_per_iteration": 2.7996537685394287 }, { "auxiliary_loss_clip": 0.01070026, "auxiliary_loss_mlp": 0.01030441, "balance_loss_clip": 1.04936945, "balance_loss_mlp": 1.02712655, "epoch": 0.18085074402525175, "flos": 64466515468800.0, "grad_norm": 0.801982400183398, "language_loss": 0.56987137, "learning_rate": 3.7660008161137294e-06, "loss": 0.5908761, "num_input_tokens_seen": 65043135, "step": 3008, "time_per_iteration": 3.4269092082977295 }, { "auxiliary_loss_clip": 0.01130838, "auxiliary_loss_mlp": 0.01047085, "balance_loss_clip": 1.05308366, "balance_loss_mlp": 1.02686691, "epoch": 0.18091086727791975, "flos": 23476960333440.0, "grad_norm": 1.8424126412451678, "language_loss": 0.67248082, "learning_rate": 3.765817980138021e-06, "loss": 0.69426012, "num_input_tokens_seen": 65062845, "step": 3009, "time_per_iteration": 2.7875866889953613 }, { "auxiliary_loss_clip": 0.01161719, "auxiliary_loss_mlp": 0.01044187, "balance_loss_clip": 1.0595516, "balance_loss_mlp": 1.02673507, "epoch": 0.1809709905305877, "flos": 24170467196160.0, "grad_norm": 2.4429360498363986, "language_loss": 0.75690198, "learning_rate": 3.7656350772023177e-06, "loss": 0.778961, "num_input_tokens_seen": 65082110, "step": 3010, "time_per_iteration": 2.6060268878936768 }, { "auxiliary_loss_clip": 0.01127916, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.05715132, "balance_loss_mlp": 1.02063942, "epoch": 0.18103111378325568, "flos": 21650902669440.0, "grad_norm": 1.6324915654296899, "language_loss": 0.67356348, "learning_rate": 3.7654521073135553e-06, "loss": 0.69522083, "num_input_tokens_seen": 65101985, "step": 3011, "time_per_iteration": 2.763596534729004 }, { "auxiliary_loss_clip": 0.01105034, "auxiliary_loss_mlp": 0.00777475, "balance_loss_clip": 1.04540467, "balance_loss_mlp": 1.00078559, "epoch": 0.18109123703592364, "flos": 53686918356480.0, "grad_norm": 1.551526807882757, "language_loss": 0.71288514, "learning_rate": 3.7652690704786723e-06, "loss": 0.73171026, "num_input_tokens_seen": 65129295, "step": 3012, "time_per_iteration": 3.037775993347168 }, { "auxiliary_loss_clip": 0.01132189, "auxiliary_loss_mlp": 0.01052085, "balance_loss_clip": 1.05564284, "balance_loss_mlp": 1.03348863, "epoch": 0.1811513602885916, "flos": 35845564325760.0, "grad_norm": 2.095737131475866, "language_loss": 0.62309992, "learning_rate": 3.765085966704609e-06, "loss": 0.64494264, "num_input_tokens_seen": 65150625, "step": 3013, "time_per_iteration": 2.7692227363586426 }, { "auxiliary_loss_clip": 0.01131323, "auxiliary_loss_mlp": 0.0105253, "balance_loss_clip": 1.05343401, "balance_loss_mlp": 1.03486276, "epoch": 0.18121148354125957, "flos": 23732572492800.0, "grad_norm": 1.6679267545988328, "language_loss": 0.76147234, "learning_rate": 3.764902795998309e-06, "loss": 0.78331089, "num_input_tokens_seen": 65170880, "step": 3014, "time_per_iteration": 2.7296786308288574 }, { "auxiliary_loss_clip": 0.01163543, "auxiliary_loss_mlp": 0.01050053, "balance_loss_clip": 1.05964816, "balance_loss_mlp": 1.02987087, "epoch": 0.18127160679392756, "flos": 28728320895360.0, "grad_norm": 2.1234423596691796, "language_loss": 0.66310829, "learning_rate": 3.7647195583667184e-06, "loss": 0.6852442, "num_input_tokens_seen": 65192530, "step": 3015, "time_per_iteration": 2.7575571537017822 }, { "auxiliary_loss_clip": 0.0113004, "auxiliary_loss_mlp": 0.00776613, "balance_loss_clip": 1.05429327, "balance_loss_mlp": 1.00067461, "epoch": 0.18133173004659553, "flos": 20485062938880.0, "grad_norm": 1.7837261279259933, "language_loss": 0.78152305, "learning_rate": 3.764536253816785e-06, "loss": 0.80058956, "num_input_tokens_seen": 65211675, "step": 3016, "time_per_iteration": 2.6718828678131104 }, { "auxiliary_loss_clip": 0.01145073, "auxiliary_loss_mlp": 0.01049504, "balance_loss_clip": 1.05684161, "balance_loss_mlp": 1.03068125, "epoch": 0.1813918532992635, "flos": 22852078404480.0, "grad_norm": 1.7248072345223011, "language_loss": 0.8351965, "learning_rate": 3.7643528823554602e-06, "loss": 0.85714233, "num_input_tokens_seen": 65231185, "step": 3017, "time_per_iteration": 2.6879045963287354 }, { "auxiliary_loss_clip": 0.0114091, "auxiliary_loss_mlp": 0.01042994, "balance_loss_clip": 1.05404854, "balance_loss_mlp": 1.02539897, "epoch": 0.18145197655193146, "flos": 36065122208640.0, "grad_norm": 2.2664795482488924, "language_loss": 0.6769017, "learning_rate": 3.764169443989697e-06, "loss": 0.69874066, "num_input_tokens_seen": 65251645, "step": 3018, "time_per_iteration": 4.31333327293396 }, { "auxiliary_loss_clip": 0.01147629, "auxiliary_loss_mlp": 0.00776661, "balance_loss_clip": 1.05706179, "balance_loss_mlp": 1.00074184, "epoch": 0.18151209980459942, "flos": 24023951619840.0, "grad_norm": 1.8935259017451227, "language_loss": 0.76396847, "learning_rate": 3.7639859387264518e-06, "loss": 0.78321135, "num_input_tokens_seen": 65271125, "step": 3019, "time_per_iteration": 2.7667160034179688 }, { "auxiliary_loss_clip": 0.01121465, "auxiliary_loss_mlp": 0.01046742, "balance_loss_clip": 1.05550635, "balance_loss_mlp": 1.02722728, "epoch": 0.1815722230572674, "flos": 23951627585280.0, "grad_norm": 2.042490471678265, "language_loss": 0.81550395, "learning_rate": 3.7638023665726834e-06, "loss": 0.83718598, "num_input_tokens_seen": 65290600, "step": 3020, "time_per_iteration": 4.3900346755981445 }, { "auxiliary_loss_clip": 0.01136424, "auxiliary_loss_mlp": 0.01046217, "balance_loss_clip": 1.05758023, "balance_loss_mlp": 1.02567708, "epoch": 0.18163234630993536, "flos": 24386469632640.0, "grad_norm": 1.9628186536024828, "language_loss": 0.7757082, "learning_rate": 3.763618727535352e-06, "loss": 0.79753458, "num_input_tokens_seen": 65311040, "step": 3021, "time_per_iteration": 4.3029396533966064 }, { "auxiliary_loss_clip": 0.01143245, "auxiliary_loss_mlp": 0.01047278, "balance_loss_clip": 1.05453348, "balance_loss_mlp": 1.02907431, "epoch": 0.18169246956260335, "flos": 24681332378880.0, "grad_norm": 1.725306643191844, "language_loss": 0.84863859, "learning_rate": 3.763435021621422e-06, "loss": 0.87054378, "num_input_tokens_seen": 65332115, "step": 3022, "time_per_iteration": 2.7353312969207764 }, { "auxiliary_loss_clip": 0.01132435, "auxiliary_loss_mlp": 0.01042747, "balance_loss_clip": 1.05769348, "balance_loss_mlp": 1.0235188, "epoch": 0.1817525928152713, "flos": 24243294021120.0, "grad_norm": 2.230341519134859, "language_loss": 0.69367266, "learning_rate": 3.763251248837859e-06, "loss": 0.71542448, "num_input_tokens_seen": 65352210, "step": 3023, "time_per_iteration": 2.775200605392456 }, { "auxiliary_loss_clip": 0.01127605, "auxiliary_loss_mlp": 0.01043947, "balance_loss_clip": 1.04900002, "balance_loss_mlp": 1.02556491, "epoch": 0.18181271606793928, "flos": 16472081623680.0, "grad_norm": 2.150764188548567, "language_loss": 0.74107385, "learning_rate": 3.7630674091916317e-06, "loss": 0.76278937, "num_input_tokens_seen": 65370600, "step": 3024, "time_per_iteration": 2.7364041805267334 }, { "auxiliary_loss_clip": 0.01145205, "auxiliary_loss_mlp": 0.01046837, "balance_loss_clip": 1.05719447, "balance_loss_mlp": 1.02900314, "epoch": 0.18187283932060724, "flos": 18581042805120.0, "grad_norm": 2.148591016046099, "language_loss": 0.8835662, "learning_rate": 3.7628835026897123e-06, "loss": 0.90548658, "num_input_tokens_seen": 65387270, "step": 3025, "time_per_iteration": 4.274658679962158 }, { "auxiliary_loss_clip": 0.01133667, "auxiliary_loss_mlp": 0.01050575, "balance_loss_clip": 1.05470932, "balance_loss_mlp": 1.03137028, "epoch": 0.1819329625732752, "flos": 20266833859200.0, "grad_norm": 3.6399614210311206, "language_loss": 0.79041791, "learning_rate": 3.7626995293390735e-06, "loss": 0.81226033, "num_input_tokens_seen": 65406550, "step": 3026, "time_per_iteration": 2.7589778900146484 }, { "auxiliary_loss_clip": 0.01132736, "auxiliary_loss_mlp": 0.01055367, "balance_loss_clip": 1.05774415, "balance_loss_mlp": 1.03679442, "epoch": 0.18199308582594317, "flos": 25915186512000.0, "grad_norm": 1.6980721374313217, "language_loss": 0.759978, "learning_rate": 3.762515489146692e-06, "loss": 0.78185904, "num_input_tokens_seen": 65425955, "step": 3027, "time_per_iteration": 2.7347826957702637 }, { "auxiliary_loss_clip": 0.01163558, "auxiliary_loss_mlp": 0.01053369, "balance_loss_clip": 1.05835891, "balance_loss_mlp": 1.03378284, "epoch": 0.18205320907861114, "flos": 15377524433280.0, "grad_norm": 2.2893837743041368, "language_loss": 0.85592651, "learning_rate": 3.762331382119546e-06, "loss": 0.87809575, "num_input_tokens_seen": 65442820, "step": 3028, "time_per_iteration": 2.598905563354492 }, { "auxiliary_loss_clip": 0.01156921, "auxiliary_loss_mlp": 0.0104449, "balance_loss_clip": 1.0578618, "balance_loss_mlp": 1.0260129, "epoch": 0.18211333233127913, "flos": 25624310175360.0, "grad_norm": 1.8897570500397638, "language_loss": 0.82807779, "learning_rate": 3.7621472082646183e-06, "loss": 0.85009193, "num_input_tokens_seen": 65461825, "step": 3029, "time_per_iteration": 2.677332639694214 }, { "auxiliary_loss_clip": 0.01114993, "auxiliary_loss_mlp": 0.01050232, "balance_loss_clip": 1.05223596, "balance_loss_mlp": 1.02931094, "epoch": 0.1821734555839471, "flos": 14976007228800.0, "grad_norm": 10.840079090220346, "language_loss": 0.78091359, "learning_rate": 3.761962967588891e-06, "loss": 0.80256593, "num_input_tokens_seen": 65479480, "step": 3030, "time_per_iteration": 2.6865499019622803 }, { "auxiliary_loss_clip": 0.01139676, "auxiliary_loss_mlp": 0.01043273, "balance_loss_clip": 1.05401075, "balance_loss_mlp": 1.0240562, "epoch": 0.18223357883661506, "flos": 20194007034240.0, "grad_norm": 2.05958060196279, "language_loss": 0.85162055, "learning_rate": 3.761778660099352e-06, "loss": 0.87345004, "num_input_tokens_seen": 65497775, "step": 3031, "time_per_iteration": 2.6336488723754883 }, { "auxiliary_loss_clip": 0.01116657, "auxiliary_loss_mlp": 0.00776186, "balance_loss_clip": 1.0497843, "balance_loss_mlp": 1.00052071, "epoch": 0.18229370208928303, "flos": 15231978524160.0, "grad_norm": 1.83501853384953, "language_loss": 0.79992211, "learning_rate": 3.76159428580299e-06, "loss": 0.81885058, "num_input_tokens_seen": 65516505, "step": 3032, "time_per_iteration": 2.6879780292510986 }, { "auxiliary_loss_clip": 0.01166412, "auxiliary_loss_mlp": 0.01048902, "balance_loss_clip": 1.06163025, "balance_loss_mlp": 1.03038836, "epoch": 0.182353825341951, "flos": 23840483927040.0, "grad_norm": 1.8132660189598853, "language_loss": 0.81316388, "learning_rate": 3.761409844706795e-06, "loss": 0.83531702, "num_input_tokens_seen": 65536160, "step": 3033, "time_per_iteration": 2.628100872039795 }, { "auxiliary_loss_clip": 0.01048591, "auxiliary_loss_mlp": 0.0100128, "balance_loss_clip": 1.05392861, "balance_loss_mlp": 0.99850291, "epoch": 0.18241394859461896, "flos": 61190957393280.0, "grad_norm": 0.8825814513625035, "language_loss": 0.63439631, "learning_rate": 3.7612253368177625e-06, "loss": 0.65489495, "num_input_tokens_seen": 65589375, "step": 3034, "time_per_iteration": 3.2329187393188477 }, { "auxiliary_loss_clip": 0.0112853, "auxiliary_loss_mlp": 0.01041043, "balance_loss_clip": 1.05698252, "balance_loss_mlp": 1.02384114, "epoch": 0.18247407184728695, "flos": 18471694826880.0, "grad_norm": 3.107937736318082, "language_loss": 0.79893476, "learning_rate": 3.7610407621428893e-06, "loss": 0.82063049, "num_input_tokens_seen": 65606720, "step": 3035, "time_per_iteration": 2.7644357681274414 }, { "auxiliary_loss_clip": 0.01134115, "auxiliary_loss_mlp": 0.01046396, "balance_loss_clip": 1.05675578, "balance_loss_mlp": 1.02906322, "epoch": 0.18253419509995492, "flos": 21795191602560.0, "grad_norm": 1.870086430131469, "language_loss": 0.85076666, "learning_rate": 3.7608561206891735e-06, "loss": 0.87257177, "num_input_tokens_seen": 65625495, "step": 3036, "time_per_iteration": 2.7102303504943848 }, { "auxiliary_loss_clip": 0.01140083, "auxiliary_loss_mlp": 0.01039078, "balance_loss_clip": 1.05572963, "balance_loss_mlp": 1.02192414, "epoch": 0.18259431835262288, "flos": 20149764456960.0, "grad_norm": 2.1821496235124727, "language_loss": 0.80254716, "learning_rate": 3.760671412463617e-06, "loss": 0.82433879, "num_input_tokens_seen": 65643515, "step": 3037, "time_per_iteration": 2.6703832149505615 }, { "auxiliary_loss_clip": 0.01139652, "auxiliary_loss_mlp": 0.00776941, "balance_loss_clip": 1.05986989, "balance_loss_mlp": 1.00062871, "epoch": 0.18265444160529085, "flos": 16981653916800.0, "grad_norm": 3.0764011293768023, "language_loss": 0.7950514, "learning_rate": 3.7604866374732246e-06, "loss": 0.81421733, "num_input_tokens_seen": 65658155, "step": 3038, "time_per_iteration": 2.7410895824432373 }, { "auxiliary_loss_clip": 0.01125628, "auxiliary_loss_mlp": 0.01044597, "balance_loss_clip": 1.05254972, "balance_loss_mlp": 1.02551126, "epoch": 0.1827145648579588, "flos": 34423250509440.0, "grad_norm": 1.9524772610579864, "language_loss": 0.67722493, "learning_rate": 3.7603017957250023e-06, "loss": 0.69892722, "num_input_tokens_seen": 65679310, "step": 3039, "time_per_iteration": 2.756833076477051 }, { "auxiliary_loss_clip": 0.0113051, "auxiliary_loss_mlp": 0.01051065, "balance_loss_clip": 1.053087, "balance_loss_mlp": 1.03304029, "epoch": 0.18277468811062678, "flos": 53287017264000.0, "grad_norm": 1.8757227718998248, "language_loss": 0.73394251, "learning_rate": 3.7601168872259593e-06, "loss": 0.75575823, "num_input_tokens_seen": 65705235, "step": 3040, "time_per_iteration": 3.026679039001465 }, { "auxiliary_loss_clip": 0.01143558, "auxiliary_loss_mlp": 0.01042261, "balance_loss_clip": 1.05585194, "balance_loss_mlp": 1.02373624, "epoch": 0.18283481136329474, "flos": 31650659602560.0, "grad_norm": 2.017308993436446, "language_loss": 0.60348576, "learning_rate": 3.7599319119831075e-06, "loss": 0.62534392, "num_input_tokens_seen": 65727575, "step": 3041, "time_per_iteration": 2.738554000854492 }, { "auxiliary_loss_clip": 0.01116972, "auxiliary_loss_mlp": 0.01053827, "balance_loss_clip": 1.05058599, "balance_loss_mlp": 1.03544497, "epoch": 0.18289493461596273, "flos": 53137664513280.0, "grad_norm": 2.3558133433802104, "language_loss": 0.59825706, "learning_rate": 3.7597468700034616e-06, "loss": 0.61996508, "num_input_tokens_seen": 65751370, "step": 3042, "time_per_iteration": 3.0009193420410156 }, { "auxiliary_loss_clip": 0.0112422, "auxiliary_loss_mlp": 0.01046569, "balance_loss_clip": 1.05319464, "balance_loss_mlp": 1.02917695, "epoch": 0.1829550578686307, "flos": 25589369220480.0, "grad_norm": 1.5313119565207096, "language_loss": 0.8757726, "learning_rate": 3.7595617612940374e-06, "loss": 0.89748049, "num_input_tokens_seen": 65771040, "step": 3043, "time_per_iteration": 2.7406487464904785 }, { "auxiliary_loss_clip": 0.01056788, "auxiliary_loss_mlp": 0.01056357, "balance_loss_clip": 1.04592645, "balance_loss_mlp": 1.03712869, "epoch": 0.18301518112129866, "flos": 22601422321920.0, "grad_norm": 2.144378235575635, "language_loss": 0.70980251, "learning_rate": 3.7593765858618552e-06, "loss": 0.73093396, "num_input_tokens_seen": 65789345, "step": 3044, "time_per_iteration": 2.785931348800659 }, { "auxiliary_loss_clip": 0.01105073, "auxiliary_loss_mlp": 0.01059118, "balance_loss_clip": 1.05111921, "balance_loss_mlp": 1.0381608, "epoch": 0.18307530437396663, "flos": 34020799551360.0, "grad_norm": 3.097061979225562, "language_loss": 0.64460731, "learning_rate": 3.7591913437139365e-06, "loss": 0.66624922, "num_input_tokens_seen": 65810990, "step": 3045, "time_per_iteration": 2.8085720539093018 }, { "auxiliary_loss_clip": 0.01155246, "auxiliary_loss_mlp": 0.01044973, "balance_loss_clip": 1.05604315, "balance_loss_mlp": 1.02780676, "epoch": 0.1831354276266346, "flos": 21279765392640.0, "grad_norm": 11.455833434854163, "language_loss": 0.78461385, "learning_rate": 3.7590060348573066e-06, "loss": 0.80661607, "num_input_tokens_seen": 65827230, "step": 3046, "time_per_iteration": 2.603299140930176 }, { "auxiliary_loss_clip": 0.01118725, "auxiliary_loss_mlp": 0.01042864, "balance_loss_clip": 1.04837, "balance_loss_mlp": 1.0240643, "epoch": 0.18319555087930256, "flos": 21032952065280.0, "grad_norm": 1.9889932097770582, "language_loss": 0.78733194, "learning_rate": 3.7588206592989903e-06, "loss": 0.8089478, "num_input_tokens_seen": 65845900, "step": 3047, "time_per_iteration": 2.7109453678131104 }, { "auxiliary_loss_clip": 0.01144516, "auxiliary_loss_mlp": 0.01042422, "balance_loss_clip": 1.05723858, "balance_loss_mlp": 1.0254705, "epoch": 0.18325567413197055, "flos": 34382958428160.0, "grad_norm": 1.5191744259185578, "language_loss": 0.80704039, "learning_rate": 3.7586352170460194e-06, "loss": 0.82890975, "num_input_tokens_seen": 65868730, "step": 3048, "time_per_iteration": 2.7485053539276123 }, { "auxiliary_loss_clip": 0.01139433, "auxiliary_loss_mlp": 0.01046004, "balance_loss_clip": 1.05405188, "balance_loss_mlp": 1.02552414, "epoch": 0.18331579738463852, "flos": 20558464381440.0, "grad_norm": 2.1437824577601354, "language_loss": 0.86579728, "learning_rate": 3.758449708105424e-06, "loss": 0.88765168, "num_input_tokens_seen": 65888420, "step": 3049, "time_per_iteration": 2.6876962184906006 }, { "auxiliary_loss_clip": 0.01143881, "auxiliary_loss_mlp": 0.01045208, "balance_loss_clip": 1.05379057, "balance_loss_mlp": 1.02544308, "epoch": 0.18337592063730648, "flos": 19607872901760.0, "grad_norm": 2.616661567020713, "language_loss": 0.77827966, "learning_rate": 3.75826413248424e-06, "loss": 0.80017054, "num_input_tokens_seen": 65905840, "step": 3050, "time_per_iteration": 2.5814058780670166 }, { "auxiliary_loss_clip": 0.01126116, "auxiliary_loss_mlp": 0.01041302, "balance_loss_clip": 1.04954183, "balance_loss_mlp": 1.0238502, "epoch": 0.18343604388997445, "flos": 20850885002880.0, "grad_norm": 2.3686375880611656, "language_loss": 0.99064422, "learning_rate": 3.7580784901895035e-06, "loss": 1.01231837, "num_input_tokens_seen": 65922845, "step": 3051, "time_per_iteration": 2.701848268508911 }, { "auxiliary_loss_clip": 0.01125492, "auxiliary_loss_mlp": 0.010397, "balance_loss_clip": 1.05189931, "balance_loss_mlp": 1.02078128, "epoch": 0.1834961671426424, "flos": 24394370624640.0, "grad_norm": 2.0338529701436237, "language_loss": 0.8607648, "learning_rate": 3.7578927812282542e-06, "loss": 0.88241673, "num_input_tokens_seen": 65945555, "step": 3052, "time_per_iteration": 2.7252042293548584 }, { "auxiliary_loss_clip": 0.01152967, "auxiliary_loss_mlp": 0.01044648, "balance_loss_clip": 1.05449986, "balance_loss_mlp": 1.02737474, "epoch": 0.18355629039531038, "flos": 21251612108160.0, "grad_norm": 1.8649432496703628, "language_loss": 0.73393309, "learning_rate": 3.7577070056075356e-06, "loss": 0.7559092, "num_input_tokens_seen": 65963965, "step": 3053, "time_per_iteration": 2.6331369876861572 }, { "auxiliary_loss_clip": 0.01158728, "auxiliary_loss_mlp": 0.01044052, "balance_loss_clip": 1.05783379, "balance_loss_mlp": 1.02565801, "epoch": 0.18361641364797834, "flos": 28656499651200.0, "grad_norm": 1.5358769917973574, "language_loss": 0.61891186, "learning_rate": 3.7575211633343902e-06, "loss": 0.64093965, "num_input_tokens_seen": 65985965, "step": 3054, "time_per_iteration": 2.6792421340942383 }, { "auxiliary_loss_clip": 0.01108826, "auxiliary_loss_mlp": 0.01042654, "balance_loss_clip": 1.05558836, "balance_loss_mlp": 1.02502322, "epoch": 0.18367653690064634, "flos": 20918827578240.0, "grad_norm": 2.2474279661883667, "language_loss": 0.78218341, "learning_rate": 3.7573352544158663e-06, "loss": 0.80369824, "num_input_tokens_seen": 66005645, "step": 3055, "time_per_iteration": 2.778691053390503 }, { "auxiliary_loss_clip": 0.01096638, "auxiliary_loss_mlp": 0.01050677, "balance_loss_clip": 1.05003095, "balance_loss_mlp": 1.03211594, "epoch": 0.1837366601533143, "flos": 28765596234240.0, "grad_norm": 1.8043720478204575, "language_loss": 0.7022509, "learning_rate": 3.757149278859014e-06, "loss": 0.72372401, "num_input_tokens_seen": 66025675, "step": 3056, "time_per_iteration": 2.794254779815674 }, { "auxiliary_loss_clip": 0.01140367, "auxiliary_loss_mlp": 0.01038358, "balance_loss_clip": 1.05211461, "balance_loss_mlp": 1.02181149, "epoch": 0.18379678340598227, "flos": 21251432540160.0, "grad_norm": 1.8709784760841586, "language_loss": 0.80357504, "learning_rate": 3.7569632366708842e-06, "loss": 0.82536227, "num_input_tokens_seen": 66046125, "step": 3057, "time_per_iteration": 2.644728899002075 }, { "auxiliary_loss_clip": 0.01150041, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.05482352, "balance_loss_mlp": 1.02332497, "epoch": 0.18385690665865023, "flos": 20449619193600.0, "grad_norm": 7.225766788646501, "language_loss": 0.82570755, "learning_rate": 3.756777127858533e-06, "loss": 0.84764576, "num_input_tokens_seen": 66064375, "step": 3058, "time_per_iteration": 4.136845588684082 }, { "auxiliary_loss_clip": 0.01119139, "auxiliary_loss_mlp": 0.00776668, "balance_loss_clip": 1.04992914, "balance_loss_mlp": 1.00066566, "epoch": 0.1839170299113182, "flos": 26140562398080.0, "grad_norm": 2.277694088171661, "language_loss": 0.85071868, "learning_rate": 3.756590952429017e-06, "loss": 0.86967677, "num_input_tokens_seen": 66084590, "step": 3059, "time_per_iteration": 2.745020866394043 }, { "auxiliary_loss_clip": 0.01151831, "auxiliary_loss_mlp": 0.00775088, "balance_loss_clip": 1.05359423, "balance_loss_mlp": 1.00077426, "epoch": 0.18397715316398616, "flos": 31758032332800.0, "grad_norm": 2.3540516696336216, "language_loss": 0.72983348, "learning_rate": 3.756404710389396e-06, "loss": 0.74910271, "num_input_tokens_seen": 66107105, "step": 3060, "time_per_iteration": 5.792214393615723 }, { "auxiliary_loss_clip": 0.01149482, "auxiliary_loss_mlp": 0.01041417, "balance_loss_clip": 1.05812132, "balance_loss_mlp": 1.02266574, "epoch": 0.18403727641665413, "flos": 24611989173120.0, "grad_norm": 1.5810457302838978, "language_loss": 0.73126459, "learning_rate": 3.7562184017467323e-06, "loss": 0.75317359, "num_input_tokens_seen": 66129295, "step": 3061, "time_per_iteration": 2.754167318344116 }, { "auxiliary_loss_clip": 0.01138281, "auxiliary_loss_mlp": 0.01043599, "balance_loss_clip": 1.05435956, "balance_loss_mlp": 1.02379823, "epoch": 0.18409739966932212, "flos": 23439900476160.0, "grad_norm": 1.8413104246803462, "language_loss": 0.81937188, "learning_rate": 3.7560320265080906e-06, "loss": 0.8411907, "num_input_tokens_seen": 66146910, "step": 3062, "time_per_iteration": 2.7545394897460938 }, { "auxiliary_loss_clip": 0.01144664, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.05668104, "balance_loss_mlp": 1.02259111, "epoch": 0.18415752292199009, "flos": 21872112577920.0, "grad_norm": 2.011374259171591, "language_loss": 0.72994816, "learning_rate": 3.7558455846805383e-06, "loss": 0.75180125, "num_input_tokens_seen": 66165370, "step": 3063, "time_per_iteration": 2.738293170928955 }, { "auxiliary_loss_clip": 0.01133824, "auxiliary_loss_mlp": 0.01040987, "balance_loss_clip": 1.05164194, "balance_loss_mlp": 1.02490544, "epoch": 0.18421764617465805, "flos": 25410678036480.0, "grad_norm": 2.2975785147287953, "language_loss": 0.65614092, "learning_rate": 3.7556590762711463e-06, "loss": 0.67788899, "num_input_tokens_seen": 66186210, "step": 3064, "time_per_iteration": 4.404583930969238 }, { "auxiliary_loss_clip": 0.01141547, "auxiliary_loss_mlp": 0.01042996, "balance_loss_clip": 1.05395937, "balance_loss_mlp": 1.02498376, "epoch": 0.18427776942732602, "flos": 27198131558400.0, "grad_norm": 2.1874829734431898, "language_loss": 0.68347883, "learning_rate": 3.7554725012869853e-06, "loss": 0.70532429, "num_input_tokens_seen": 66204800, "step": 3065, "time_per_iteration": 2.7149577140808105 }, { "auxiliary_loss_clip": 0.01136969, "auxiliary_loss_mlp": 0.01045319, "balance_loss_clip": 1.05518305, "balance_loss_mlp": 1.02674615, "epoch": 0.18433789267999398, "flos": 27852351920640.0, "grad_norm": 2.2758854533642925, "language_loss": 0.73142231, "learning_rate": 3.7552858597351318e-06, "loss": 0.75324523, "num_input_tokens_seen": 66222195, "step": 3066, "time_per_iteration": 2.672675609588623 }, { "auxiliary_loss_clip": 0.01125186, "auxiliary_loss_mlp": 0.01043389, "balance_loss_clip": 1.04947495, "balance_loss_mlp": 1.0256983, "epoch": 0.18439801593266195, "flos": 17856940533120.0, "grad_norm": 2.1067167513095444, "language_loss": 0.82191038, "learning_rate": 3.7550991516226622e-06, "loss": 0.8435961, "num_input_tokens_seen": 66239505, "step": 3067, "time_per_iteration": 2.697768211364746 }, { "auxiliary_loss_clip": 0.01082345, "auxiliary_loss_mlp": 0.00756782, "balance_loss_clip": 1.04466891, "balance_loss_mlp": 1.00113225, "epoch": 0.18445813918532994, "flos": 56389522590720.0, "grad_norm": 0.7960107429271657, "language_loss": 0.59750569, "learning_rate": 3.754912376956657e-06, "loss": 0.61589694, "num_input_tokens_seen": 66295695, "step": 3068, "time_per_iteration": 3.0305213928222656 }, { "auxiliary_loss_clip": 0.01127048, "auxiliary_loss_mlp": 0.01041294, "balance_loss_clip": 1.05452299, "balance_loss_mlp": 1.02356791, "epoch": 0.1845182624379979, "flos": 20957180325120.0, "grad_norm": 3.7299324256794244, "language_loss": 0.76434112, "learning_rate": 3.7547255357441987e-06, "loss": 0.78602457, "num_input_tokens_seen": 66315315, "step": 3069, "time_per_iteration": 2.6757962703704834 }, { "auxiliary_loss_clip": 0.01146412, "auxiliary_loss_mlp": 0.010456, "balance_loss_clip": 1.05468106, "balance_loss_mlp": 1.02798057, "epoch": 0.18457838569066587, "flos": 20485170679680.0, "grad_norm": 1.9225240149566294, "language_loss": 0.8491416, "learning_rate": 3.7545386279923718e-06, "loss": 0.87106168, "num_input_tokens_seen": 66333675, "step": 3070, "time_per_iteration": 2.617023229598999 }, { "auxiliary_loss_clip": 0.01127789, "auxiliary_loss_mlp": 0.01043452, "balance_loss_clip": 1.0553112, "balance_loss_mlp": 1.02510571, "epoch": 0.18463850894333383, "flos": 25010022758400.0, "grad_norm": 6.700503585098448, "language_loss": 0.77807182, "learning_rate": 3.754351653708265e-06, "loss": 0.79978424, "num_input_tokens_seen": 66354075, "step": 3071, "time_per_iteration": 2.847329616546631 }, { "auxiliary_loss_clip": 0.01109458, "auxiliary_loss_mlp": 0.01049978, "balance_loss_clip": 1.05054557, "balance_loss_mlp": 1.03154778, "epoch": 0.1846986321960018, "flos": 16800628348800.0, "grad_norm": 2.0836336776071565, "language_loss": 0.77414191, "learning_rate": 3.7541646128989674e-06, "loss": 0.79573631, "num_input_tokens_seen": 66372520, "step": 3072, "time_per_iteration": 2.780921220779419 }, { "auxiliary_loss_clip": 0.01138997, "auxiliary_loss_mlp": 0.01043594, "balance_loss_clip": 1.05106127, "balance_loss_mlp": 1.02465141, "epoch": 0.18475875544866976, "flos": 20814327936000.0, "grad_norm": 4.959080593148226, "language_loss": 0.86546457, "learning_rate": 3.7539775055715715e-06, "loss": 0.88729048, "num_input_tokens_seen": 66390745, "step": 3073, "time_per_iteration": 2.631913661956787 }, { "auxiliary_loss_clip": 0.01158717, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.05862749, "balance_loss_mlp": 1.02366686, "epoch": 0.18481887870133773, "flos": 22601422321920.0, "grad_norm": 2.162700927804164, "language_loss": 0.91831195, "learning_rate": 3.7537903317331732e-06, "loss": 0.94030046, "num_input_tokens_seen": 66410525, "step": 3074, "time_per_iteration": 2.6152567863464355 }, { "auxiliary_loss_clip": 0.01104968, "auxiliary_loss_mlp": 0.01047718, "balance_loss_clip": 1.04757643, "balance_loss_mlp": 1.02763104, "epoch": 0.18487900195400572, "flos": 29458815788160.0, "grad_norm": 1.9967983521568784, "language_loss": 0.64783108, "learning_rate": 3.75360309139087e-06, "loss": 0.66935796, "num_input_tokens_seen": 66432535, "step": 3075, "time_per_iteration": 2.763559103012085 }, { "auxiliary_loss_clip": 0.01135247, "auxiliary_loss_mlp": 0.01046601, "balance_loss_clip": 1.05689573, "balance_loss_mlp": 1.02913702, "epoch": 0.1849391252066737, "flos": 20628777254400.0, "grad_norm": 1.8996898495981898, "language_loss": 0.72803432, "learning_rate": 3.753415784551761e-06, "loss": 0.74985278, "num_input_tokens_seen": 66450620, "step": 3076, "time_per_iteration": 2.76629376411438 }, { "auxiliary_loss_clip": 0.01124833, "auxiliary_loss_mlp": 0.01042344, "balance_loss_clip": 1.0584389, "balance_loss_mlp": 1.0249157, "epoch": 0.18499924845934165, "flos": 14428549065600.0, "grad_norm": 2.4862024108169556, "language_loss": 0.80772626, "learning_rate": 3.7532284112229507e-06, "loss": 0.82939804, "num_input_tokens_seen": 66467865, "step": 3077, "time_per_iteration": 2.7296142578125 }, { "auxiliary_loss_clip": 0.01128471, "auxiliary_loss_mlp": 0.01041495, "balance_loss_clip": 1.05401397, "balance_loss_mlp": 1.02428079, "epoch": 0.18505937171200962, "flos": 23727652329600.0, "grad_norm": 1.8214336253769514, "language_loss": 0.78693211, "learning_rate": 3.7530409714115424e-06, "loss": 0.80863178, "num_input_tokens_seen": 66486245, "step": 3078, "time_per_iteration": 2.715838670730591 }, { "auxiliary_loss_clip": 0.01154963, "auxiliary_loss_mlp": 0.01043373, "balance_loss_clip": 1.05546641, "balance_loss_mlp": 1.02655268, "epoch": 0.18511949496467758, "flos": 25957489754880.0, "grad_norm": 1.7455066055145632, "language_loss": 0.77326959, "learning_rate": 3.7528534651246453e-06, "loss": 0.79525292, "num_input_tokens_seen": 66506510, "step": 3079, "time_per_iteration": 2.674128770828247 }, { "auxiliary_loss_clip": 0.01119079, "auxiliary_loss_mlp": 0.01041512, "balance_loss_clip": 1.04717147, "balance_loss_mlp": 1.02328515, "epoch": 0.18517961821734555, "flos": 42413553912960.0, "grad_norm": 1.885086933557342, "language_loss": 0.82143807, "learning_rate": 3.752665892369369e-06, "loss": 0.84304404, "num_input_tokens_seen": 66530960, "step": 3080, "time_per_iteration": 2.906940460205078 }, { "auxiliary_loss_clip": 0.01123637, "auxiliary_loss_mlp": 0.01044031, "balance_loss_clip": 1.05894399, "balance_loss_mlp": 1.02563691, "epoch": 0.18523974147001354, "flos": 24097568544000.0, "grad_norm": 2.065822240576764, "language_loss": 0.73973286, "learning_rate": 3.7524782531528266e-06, "loss": 0.76140958, "num_input_tokens_seen": 66550275, "step": 3081, "time_per_iteration": 2.7960739135742188 }, { "auxiliary_loss_clip": 0.01126977, "auxiliary_loss_mlp": 0.01051674, "balance_loss_clip": 1.05360913, "balance_loss_mlp": 1.03286242, "epoch": 0.1852998647226815, "flos": 27375278457600.0, "grad_norm": 1.9854893879184425, "language_loss": 0.71991849, "learning_rate": 3.7522905474821334e-06, "loss": 0.74170506, "num_input_tokens_seen": 66569040, "step": 3082, "time_per_iteration": 2.6965079307556152 }, { "auxiliary_loss_clip": 0.01124933, "auxiliary_loss_mlp": 0.01046296, "balance_loss_clip": 1.05649543, "balance_loss_mlp": 1.02694798, "epoch": 0.18535998797534947, "flos": 18332757020160.0, "grad_norm": 2.0424653419479886, "language_loss": 0.69580144, "learning_rate": 3.752102775364407e-06, "loss": 0.71751374, "num_input_tokens_seen": 66587775, "step": 3083, "time_per_iteration": 2.727252721786499 }, { "auxiliary_loss_clip": 0.01122388, "auxiliary_loss_mlp": 0.01046999, "balance_loss_clip": 1.05204451, "balance_loss_mlp": 1.02964258, "epoch": 0.18542011122801744, "flos": 37845859887360.0, "grad_norm": 2.185713468975319, "language_loss": 0.68965334, "learning_rate": 3.751914936806767e-06, "loss": 0.71134722, "num_input_tokens_seen": 66610800, "step": 3084, "time_per_iteration": 2.95849871635437 }, { "auxiliary_loss_clip": 0.01155184, "auxiliary_loss_mlp": 0.01043029, "balance_loss_clip": 1.05578482, "balance_loss_mlp": 1.0257436, "epoch": 0.1854802344806854, "flos": 25186128163200.0, "grad_norm": 1.6859724806626923, "language_loss": 0.77390355, "learning_rate": 3.7517270318163377e-06, "loss": 0.79588568, "num_input_tokens_seen": 66630960, "step": 3085, "time_per_iteration": 2.68961501121521 }, { "auxiliary_loss_clip": 0.01152089, "auxiliary_loss_mlp": 0.01049004, "balance_loss_clip": 1.05316019, "balance_loss_mlp": 1.03142118, "epoch": 0.18554035773335337, "flos": 26684788337280.0, "grad_norm": 1.993169596996871, "language_loss": 0.73752379, "learning_rate": 3.751539060400244e-06, "loss": 0.75953472, "num_input_tokens_seen": 66650585, "step": 3086, "time_per_iteration": 2.652475595474243 }, { "auxiliary_loss_clip": 0.01142754, "auxiliary_loss_mlp": 0.01049865, "balance_loss_clip": 1.05530787, "balance_loss_mlp": 1.03134012, "epoch": 0.18560048098602133, "flos": 22346887570560.0, "grad_norm": 7.927127736744579, "language_loss": 0.69762361, "learning_rate": 3.7513510225656132e-06, "loss": 0.71954978, "num_input_tokens_seen": 66670045, "step": 3087, "time_per_iteration": 2.668849229812622 }, { "auxiliary_loss_clip": 0.01119022, "auxiliary_loss_mlp": 0.01055302, "balance_loss_clip": 1.05543649, "balance_loss_mlp": 1.03546548, "epoch": 0.18566060423868933, "flos": 17748526308480.0, "grad_norm": 2.1117122734340263, "language_loss": 0.72513628, "learning_rate": 3.7511629183195764e-06, "loss": 0.74687952, "num_input_tokens_seen": 66688790, "step": 3088, "time_per_iteration": 2.7150719165802 }, { "auxiliary_loss_clip": 0.0112638, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.04933047, "balance_loss_mlp": 1.02616334, "epoch": 0.1857207274913573, "flos": 24677274142080.0, "grad_norm": 2.112009927874319, "language_loss": 0.91859758, "learning_rate": 3.7509747476692663e-06, "loss": 0.94030321, "num_input_tokens_seen": 66708090, "step": 3089, "time_per_iteration": 2.7239248752593994 }, { "auxiliary_loss_clip": 0.01104754, "auxiliary_loss_mlp": 0.01046981, "balance_loss_clip": 1.0494597, "balance_loss_mlp": 1.02919531, "epoch": 0.18578085074402526, "flos": 28147825198080.0, "grad_norm": 2.490831087537115, "language_loss": 0.57275403, "learning_rate": 3.7507865106218176e-06, "loss": 0.59427136, "num_input_tokens_seen": 66727320, "step": 3090, "time_per_iteration": 2.8263309001922607 }, { "auxiliary_loss_clip": 0.01125877, "auxiliary_loss_mlp": 0.0104478, "balance_loss_clip": 1.04981184, "balance_loss_mlp": 1.02636242, "epoch": 0.18584097399669322, "flos": 23951878980480.0, "grad_norm": 1.7797305478565062, "language_loss": 0.81704801, "learning_rate": 3.7505982071843695e-06, "loss": 0.83875453, "num_input_tokens_seen": 66747505, "step": 3091, "time_per_iteration": 2.697525978088379 }, { "auxiliary_loss_clip": 0.01101743, "auxiliary_loss_mlp": 0.01050837, "balance_loss_clip": 1.04999971, "balance_loss_mlp": 1.03277707, "epoch": 0.18590109724936119, "flos": 17201678676480.0, "grad_norm": 2.0826959244757832, "language_loss": 0.83704746, "learning_rate": 3.7504098373640617e-06, "loss": 0.8585732, "num_input_tokens_seen": 66766425, "step": 3092, "time_per_iteration": 2.8379435539245605 }, { "auxiliary_loss_clip": 0.01136846, "auxiliary_loss_mlp": 0.01048758, "balance_loss_clip": 1.05389428, "balance_loss_mlp": 1.03036356, "epoch": 0.18596122050202915, "flos": 17234644383360.0, "grad_norm": 5.439917179387958, "language_loss": 0.93443698, "learning_rate": 3.750221401168038e-06, "loss": 0.95629299, "num_input_tokens_seen": 66781130, "step": 3093, "time_per_iteration": 2.8053483963012695 }, { "auxiliary_loss_clip": 0.01130362, "auxiliary_loss_mlp": 0.01042367, "balance_loss_clip": 1.05440521, "balance_loss_mlp": 1.02464092, "epoch": 0.18602134375469712, "flos": 19020733188480.0, "grad_norm": 1.7318887555782294, "language_loss": 0.77516603, "learning_rate": 3.750032898603443e-06, "loss": 0.7968933, "num_input_tokens_seen": 66797535, "step": 3094, "time_per_iteration": 2.7402310371398926 }, { "auxiliary_loss_clip": 0.0109741, "auxiliary_loss_mlp": 0.01049219, "balance_loss_clip": 1.0519228, "balance_loss_mlp": 1.0323391, "epoch": 0.1860814670073651, "flos": 50950094417280.0, "grad_norm": 1.7033453736007413, "language_loss": 0.69854707, "learning_rate": 3.749844329677425e-06, "loss": 0.72001338, "num_input_tokens_seen": 66821720, "step": 3095, "time_per_iteration": 3.133192777633667 }, { "auxiliary_loss_clip": 0.01113224, "auxiliary_loss_mlp": 0.010546, "balance_loss_clip": 1.0511899, "balance_loss_mlp": 1.03415525, "epoch": 0.18614159026003307, "flos": 19390972625280.0, "grad_norm": 2.2828801406167307, "language_loss": 0.81214821, "learning_rate": 3.749655694397135e-06, "loss": 0.83382642, "num_input_tokens_seen": 66839060, "step": 3096, "time_per_iteration": 2.7599101066589355 }, { "auxiliary_loss_clip": 0.01147399, "auxiliary_loss_mlp": 0.0104683, "balance_loss_clip": 1.05678356, "balance_loss_mlp": 1.02810192, "epoch": 0.18620171351270104, "flos": 21798782962560.0, "grad_norm": 2.430947734084612, "language_loss": 0.75326216, "learning_rate": 3.7494669927697255e-06, "loss": 0.77520448, "num_input_tokens_seen": 66857760, "step": 3097, "time_per_iteration": 4.255983114242554 }, { "auxiliary_loss_clip": 0.01133757, "auxiliary_loss_mlp": 0.01050365, "balance_loss_clip": 1.05756521, "balance_loss_mlp": 1.03228104, "epoch": 0.186261836765369, "flos": 16362877299840.0, "grad_norm": 2.553895603581972, "language_loss": 0.66602015, "learning_rate": 3.749278224802352e-06, "loss": 0.68786132, "num_input_tokens_seen": 66876460, "step": 3098, "time_per_iteration": 2.723567247390747 }, { "auxiliary_loss_clip": 0.01163461, "auxiliary_loss_mlp": 0.01052357, "balance_loss_clip": 1.05991709, "balance_loss_mlp": 1.03212702, "epoch": 0.18632196001803697, "flos": 23370054480000.0, "grad_norm": 1.6168121451860142, "language_loss": 0.69838905, "learning_rate": 3.7490893905021733e-06, "loss": 0.7205472, "num_input_tokens_seen": 66897960, "step": 3099, "time_per_iteration": 5.687380075454712 }, { "auxiliary_loss_clip": 0.01148363, "auxiliary_loss_mlp": 0.01051556, "balance_loss_clip": 1.05713868, "balance_loss_mlp": 1.03243458, "epoch": 0.18638208327070493, "flos": 22492002516480.0, "grad_norm": 1.7060244708994476, "language_loss": 0.71840072, "learning_rate": 3.7489004898763494e-06, "loss": 0.74039996, "num_input_tokens_seen": 66917675, "step": 3100, "time_per_iteration": 2.6711015701293945 }, { "auxiliary_loss_clip": 0.01138377, "auxiliary_loss_mlp": 0.01050667, "balance_loss_clip": 1.05749035, "balance_loss_mlp": 1.03133154, "epoch": 0.18644220652337293, "flos": 29165245931520.0, "grad_norm": 1.9639279354826686, "language_loss": 0.80343997, "learning_rate": 3.7487115229320444e-06, "loss": 0.82533038, "num_input_tokens_seen": 66936000, "step": 3101, "time_per_iteration": 2.6996583938598633 }, { "auxiliary_loss_clip": 0.01112778, "auxiliary_loss_mlp": 0.01042097, "balance_loss_clip": 1.05307627, "balance_loss_mlp": 1.02478826, "epoch": 0.1865023297760409, "flos": 24243796811520.0, "grad_norm": 1.8804860702941575, "language_loss": 0.77053607, "learning_rate": 3.7485224896764222e-06, "loss": 0.79208481, "num_input_tokens_seen": 66955700, "step": 3102, "time_per_iteration": 2.726146936416626 }, { "auxiliary_loss_clip": 0.01150817, "auxiliary_loss_mlp": 0.01039303, "balance_loss_clip": 1.057688, "balance_loss_mlp": 1.0213027, "epoch": 0.18656245302870886, "flos": 19128716449920.0, "grad_norm": 2.314682178811096, "language_loss": 0.76689744, "learning_rate": 3.7483333901166525e-06, "loss": 0.78879869, "num_input_tokens_seen": 66972815, "step": 3103, "time_per_iteration": 4.374122619628906 }, { "auxiliary_loss_clip": 0.01132531, "auxiliary_loss_mlp": 0.0104481, "balance_loss_clip": 1.05477643, "balance_loss_mlp": 1.02671361, "epoch": 0.18662257628137682, "flos": 17786088956160.0, "grad_norm": 1.6956506235876265, "language_loss": 0.79252636, "learning_rate": 3.7481442242599054e-06, "loss": 0.8142997, "num_input_tokens_seen": 66992280, "step": 3104, "time_per_iteration": 2.695012092590332 }, { "auxiliary_loss_clip": 0.01106786, "auxiliary_loss_mlp": 0.01050273, "balance_loss_clip": 1.05117702, "balance_loss_mlp": 1.03096056, "epoch": 0.1866826995340448, "flos": 24024382583040.0, "grad_norm": 2.065624302338532, "language_loss": 0.8496474, "learning_rate": 3.747954992113354e-06, "loss": 0.87121809, "num_input_tokens_seen": 67012220, "step": 3105, "time_per_iteration": 2.761521816253662 }, { "auxiliary_loss_clip": 0.0112324, "auxiliary_loss_mlp": 0.01043689, "balance_loss_clip": 1.05166531, "balance_loss_mlp": 1.02407932, "epoch": 0.18674282278671275, "flos": 26141244756480.0, "grad_norm": 1.8352441384571676, "language_loss": 0.86880243, "learning_rate": 3.7477656936841742e-06, "loss": 0.8904717, "num_input_tokens_seen": 67032030, "step": 3106, "time_per_iteration": 2.785738706588745 }, { "auxiliary_loss_clip": 0.01150222, "auxiliary_loss_mlp": 0.01040973, "balance_loss_clip": 1.0566026, "balance_loss_mlp": 1.02281737, "epoch": 0.18680294603938072, "flos": 19201938324480.0, "grad_norm": 2.128833658771433, "language_loss": 0.78226906, "learning_rate": 3.7475763289795445e-06, "loss": 0.80418098, "num_input_tokens_seen": 67048920, "step": 3107, "time_per_iteration": 2.693995237350464 }, { "auxiliary_loss_clip": 0.01153763, "auxiliary_loss_mlp": 0.01053056, "balance_loss_clip": 1.05873394, "balance_loss_mlp": 1.03341043, "epoch": 0.1868630692920487, "flos": 28544889116160.0, "grad_norm": 3.0927798335187506, "language_loss": 0.74159014, "learning_rate": 3.7473868980066446e-06, "loss": 0.7636584, "num_input_tokens_seen": 67068645, "step": 3108, "time_per_iteration": 2.795715570449829 }, { "auxiliary_loss_clip": 0.01107582, "auxiliary_loss_mlp": 0.01042714, "balance_loss_clip": 1.05207491, "balance_loss_mlp": 1.02451098, "epoch": 0.18692319254471668, "flos": 17238020261760.0, "grad_norm": 1.6837485322309411, "language_loss": 0.74348569, "learning_rate": 3.747197400772658e-06, "loss": 0.76498872, "num_input_tokens_seen": 67087075, "step": 3109, "time_per_iteration": 2.7627830505371094 }, { "auxiliary_loss_clip": 0.01145572, "auxiliary_loss_mlp": 0.01044117, "balance_loss_clip": 1.05631042, "balance_loss_mlp": 1.02526462, "epoch": 0.18698331579738464, "flos": 23185186156800.0, "grad_norm": 1.499459601293056, "language_loss": 0.84250218, "learning_rate": 3.747007837284772e-06, "loss": 0.86439908, "num_input_tokens_seen": 67108040, "step": 3110, "time_per_iteration": 2.7665328979492188 }, { "auxiliary_loss_clip": 0.01147578, "auxiliary_loss_mlp": 0.01042389, "balance_loss_clip": 1.05929494, "balance_loss_mlp": 1.02381575, "epoch": 0.1870434390500526, "flos": 25516721963520.0, "grad_norm": 1.9108380391903876, "language_loss": 0.84738445, "learning_rate": 3.7468182075501737e-06, "loss": 0.86928415, "num_input_tokens_seen": 67127605, "step": 3111, "time_per_iteration": 2.729233741760254 }, { "auxiliary_loss_clip": 0.01128, "auxiliary_loss_mlp": 0.01044544, "balance_loss_clip": 1.05348754, "balance_loss_mlp": 1.02635229, "epoch": 0.18710356230272057, "flos": 19500823393920.0, "grad_norm": 1.8704338434966796, "language_loss": 0.76875687, "learning_rate": 3.7466285115760536e-06, "loss": 0.79048228, "num_input_tokens_seen": 67145785, "step": 3112, "time_per_iteration": 2.7392494678497314 }, { "auxiliary_loss_clip": 0.0114846, "auxiliary_loss_mlp": 0.0104709, "balance_loss_clip": 1.05636978, "balance_loss_mlp": 1.02913654, "epoch": 0.18716368555538854, "flos": 26760847386240.0, "grad_norm": 1.8996972204761096, "language_loss": 0.64466536, "learning_rate": 3.7464387493696046e-06, "loss": 0.66662085, "num_input_tokens_seen": 67165930, "step": 3113, "time_per_iteration": 2.7393765449523926 }, { "auxiliary_loss_clip": 0.01153807, "auxiliary_loss_mlp": 0.01048748, "balance_loss_clip": 1.05685568, "balance_loss_mlp": 1.02900672, "epoch": 0.1872238088080565, "flos": 25189827264000.0, "grad_norm": 6.483287708452815, "language_loss": 0.817972, "learning_rate": 3.746248920938024e-06, "loss": 0.83999759, "num_input_tokens_seen": 67185830, "step": 3114, "time_per_iteration": 2.740229368209839 }, { "auxiliary_loss_clip": 0.01104278, "auxiliary_loss_mlp": 0.01050738, "balance_loss_clip": 1.04921412, "balance_loss_mlp": 1.03024614, "epoch": 0.1872839320607245, "flos": 24134305178880.0, "grad_norm": 2.3064843449079175, "language_loss": 0.57413173, "learning_rate": 3.74605902628851e-06, "loss": 0.59568191, "num_input_tokens_seen": 67206930, "step": 3115, "time_per_iteration": 2.811549663543701 }, { "auxiliary_loss_clip": 0.01123025, "auxiliary_loss_mlp": 0.01052226, "balance_loss_clip": 1.05446446, "balance_loss_mlp": 1.03241396, "epoch": 0.18734405531339246, "flos": 21173793292800.0, "grad_norm": 2.577640519639585, "language_loss": 0.70842528, "learning_rate": 3.745869065428261e-06, "loss": 0.73017788, "num_input_tokens_seen": 67226290, "step": 3116, "time_per_iteration": 2.8053951263427734 }, { "auxiliary_loss_clip": 0.0115042, "auxiliary_loss_mlp": 0.01035569, "balance_loss_clip": 1.05196476, "balance_loss_mlp": 1.01787841, "epoch": 0.18740417856606043, "flos": 17237697039360.0, "grad_norm": 3.010261965906642, "language_loss": 0.78994375, "learning_rate": 3.7456790383644833e-06, "loss": 0.81180358, "num_input_tokens_seen": 67244410, "step": 3117, "time_per_iteration": 2.819415330886841 }, { "auxiliary_loss_clip": 0.01132901, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.05260777, "balance_loss_mlp": 1.03047204, "epoch": 0.1874643018187284, "flos": 32558049999360.0, "grad_norm": 2.2828109389679865, "language_loss": 0.83903432, "learning_rate": 3.745488945104381e-06, "loss": 0.86086059, "num_input_tokens_seen": 67264470, "step": 3118, "time_per_iteration": 2.783804416656494 }, { "auxiliary_loss_clip": 0.01144867, "auxiliary_loss_mlp": 0.0104452, "balance_loss_clip": 1.05412436, "balance_loss_mlp": 1.02688873, "epoch": 0.18752442507139636, "flos": 23258156636160.0, "grad_norm": 3.566737352043019, "language_loss": 0.76283264, "learning_rate": 3.7452987856551636e-06, "loss": 0.78472656, "num_input_tokens_seen": 67284315, "step": 3119, "time_per_iteration": 2.6872506141662598 }, { "auxiliary_loss_clip": 0.01156835, "auxiliary_loss_mlp": 0.01046653, "balance_loss_clip": 1.05519438, "balance_loss_mlp": 1.02899814, "epoch": 0.18758454832406432, "flos": 21760933006080.0, "grad_norm": 1.7224942549361077, "language_loss": 0.82017547, "learning_rate": 3.7451085600240406e-06, "loss": 0.84221041, "num_input_tokens_seen": 67302780, "step": 3120, "time_per_iteration": 2.637505292892456 }, { "auxiliary_loss_clip": 0.0113033, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.05060756, "balance_loss_mlp": 1.01828837, "epoch": 0.1876446715767323, "flos": 29570210841600.0, "grad_norm": 2.5027223446471982, "language_loss": 0.84992659, "learning_rate": 3.7449182682182263e-06, "loss": 0.87158525, "num_input_tokens_seen": 67323405, "step": 3121, "time_per_iteration": 2.788353681564331 }, { "auxiliary_loss_clip": 0.01096681, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.045645, "balance_loss_mlp": 1.02599168, "epoch": 0.18770479482940028, "flos": 30339992234880.0, "grad_norm": 2.1738591443482362, "language_loss": 0.70032287, "learning_rate": 3.744727910244937e-06, "loss": 0.72173256, "num_input_tokens_seen": 67345800, "step": 3122, "time_per_iteration": 3.0225250720977783 }, { "auxiliary_loss_clip": 0.01153439, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.05445123, "balance_loss_mlp": 1.02288795, "epoch": 0.18776491808206824, "flos": 14465357527680.0, "grad_norm": 4.839579375412361, "language_loss": 0.70661515, "learning_rate": 3.7445374861113905e-06, "loss": 0.72857308, "num_input_tokens_seen": 67363575, "step": 3123, "time_per_iteration": 2.779904365539551 }, { "auxiliary_loss_clip": 0.01142265, "auxiliary_loss_mlp": 0.01041425, "balance_loss_clip": 1.05286181, "balance_loss_mlp": 1.02454507, "epoch": 0.1878250413347362, "flos": 24498547044480.0, "grad_norm": 2.057520579072589, "language_loss": 0.74103826, "learning_rate": 3.7443469958248066e-06, "loss": 0.76287514, "num_input_tokens_seen": 67381765, "step": 3124, "time_per_iteration": 2.6336071491241455 }, { "auxiliary_loss_clip": 0.01157579, "auxiliary_loss_mlp": 0.01052509, "balance_loss_clip": 1.05653572, "balance_loss_mlp": 1.03333998, "epoch": 0.18788516458740417, "flos": 39786185692800.0, "grad_norm": 3.0670363966795096, "language_loss": 0.80654436, "learning_rate": 3.7441564393924106e-06, "loss": 0.82864523, "num_input_tokens_seen": 67405000, "step": 3125, "time_per_iteration": 2.7224199771881104 }, { "auxiliary_loss_clip": 0.01046615, "auxiliary_loss_mlp": 0.01006504, "balance_loss_clip": 1.04444218, "balance_loss_mlp": 1.00435853, "epoch": 0.18794528784007214, "flos": 64699250664960.0, "grad_norm": 0.9424570711133922, "language_loss": 0.63647306, "learning_rate": 3.7439658168214273e-06, "loss": 0.65700436, "num_input_tokens_seen": 67467140, "step": 3126, "time_per_iteration": 3.313321113586426 }, { "auxiliary_loss_clip": 0.01128308, "auxiliary_loss_mlp": 0.01040458, "balance_loss_clip": 1.05377257, "balance_loss_mlp": 1.02236164, "epoch": 0.1880054110927401, "flos": 28622061486720.0, "grad_norm": 1.8734163453478039, "language_loss": 0.81308508, "learning_rate": 3.7437751281190857e-06, "loss": 0.83477271, "num_input_tokens_seen": 67487980, "step": 3127, "time_per_iteration": 2.7137866020202637 }, { "auxiliary_loss_clip": 0.01088267, "auxiliary_loss_mlp": 0.0101138, "balance_loss_clip": 1.04814553, "balance_loss_mlp": 1.00912714, "epoch": 0.1880655343454081, "flos": 64488958490880.0, "grad_norm": 0.7699217277386954, "language_loss": 0.61922526, "learning_rate": 3.7435843732926164e-06, "loss": 0.64022171, "num_input_tokens_seen": 67552500, "step": 3128, "time_per_iteration": 3.264270782470703 }, { "auxiliary_loss_clip": 0.01108205, "auxiliary_loss_mlp": 0.01049422, "balance_loss_clip": 1.04763842, "balance_loss_mlp": 1.02907288, "epoch": 0.18812565759807606, "flos": 32124464928000.0, "grad_norm": 2.4867495334212175, "language_loss": 0.70985162, "learning_rate": 3.7433935523492536e-06, "loss": 0.73142785, "num_input_tokens_seen": 67573295, "step": 3129, "time_per_iteration": 2.79929256439209 }, { "auxiliary_loss_clip": 0.01158485, "auxiliary_loss_mlp": 0.01050611, "balance_loss_clip": 1.05767536, "balance_loss_mlp": 1.03109634, "epoch": 0.18818578085074403, "flos": 20624539449600.0, "grad_norm": 2.4831518001798676, "language_loss": 0.85035253, "learning_rate": 3.7432026652962314e-06, "loss": 0.87244344, "num_input_tokens_seen": 67590010, "step": 3130, "time_per_iteration": 2.60624361038208 }, { "auxiliary_loss_clip": 0.01107202, "auxiliary_loss_mlp": 0.01049966, "balance_loss_clip": 1.04649067, "balance_loss_mlp": 1.03023696, "epoch": 0.188245904103412, "flos": 28840506048000.0, "grad_norm": 9.096753382647533, "language_loss": 0.7643525, "learning_rate": 3.7430117121407897e-06, "loss": 0.7859242, "num_input_tokens_seen": 67611110, "step": 3131, "time_per_iteration": 2.759230136871338 }, { "auxiliary_loss_clip": 0.0112329, "auxiliary_loss_mlp": 0.01049221, "balance_loss_clip": 1.05344164, "balance_loss_mlp": 1.03014708, "epoch": 0.18830602735607996, "flos": 29420319386880.0, "grad_norm": 2.109252219381847, "language_loss": 0.80713749, "learning_rate": 3.74282069289017e-06, "loss": 0.82886261, "num_input_tokens_seen": 67631990, "step": 3132, "time_per_iteration": 2.773817777633667 }, { "auxiliary_loss_clip": 0.01093588, "auxiliary_loss_mlp": 0.00779094, "balance_loss_clip": 1.04652429, "balance_loss_mlp": 1.00091529, "epoch": 0.18836615060874792, "flos": 28872933050880.0, "grad_norm": 2.092242478448591, "language_loss": 0.79653811, "learning_rate": 3.742629607551614e-06, "loss": 0.81526494, "num_input_tokens_seen": 67650490, "step": 3133, "time_per_iteration": 2.7873754501342773 }, { "auxiliary_loss_clip": 0.01119878, "auxiliary_loss_mlp": 0.01059381, "balance_loss_clip": 1.05341148, "balance_loss_mlp": 1.03921056, "epoch": 0.18842627386141592, "flos": 22601673717120.0, "grad_norm": 1.9069857551930867, "language_loss": 0.83001804, "learning_rate": 3.7424384561323698e-06, "loss": 0.85181063, "num_input_tokens_seen": 67668860, "step": 3134, "time_per_iteration": 2.9284298419952393 }, { "auxiliary_loss_clip": 0.01131578, "auxiliary_loss_mlp": 0.01046681, "balance_loss_clip": 1.05168402, "balance_loss_mlp": 1.02802503, "epoch": 0.18848639711408388, "flos": 24573600512640.0, "grad_norm": 2.0376543711114152, "language_loss": 0.82859468, "learning_rate": 3.742247238639684e-06, "loss": 0.85037726, "num_input_tokens_seen": 67690220, "step": 3135, "time_per_iteration": 2.8006811141967773 }, { "auxiliary_loss_clip": 0.01143148, "auxiliary_loss_mlp": 0.01050197, "balance_loss_clip": 1.05505157, "balance_loss_mlp": 1.03146911, "epoch": 0.18854652036675185, "flos": 34166920078080.0, "grad_norm": 1.9728388324049713, "language_loss": 0.78658557, "learning_rate": 3.7420559550808083e-06, "loss": 0.80851901, "num_input_tokens_seen": 67709820, "step": 3136, "time_per_iteration": 4.256143569946289 }, { "auxiliary_loss_clip": 0.01135545, "auxiliary_loss_mlp": 0.01048618, "balance_loss_clip": 1.05388892, "balance_loss_mlp": 1.03006911, "epoch": 0.1886066436194198, "flos": 24200236592640.0, "grad_norm": 1.7483697887361769, "language_loss": 0.80820233, "learning_rate": 3.741864605462996e-06, "loss": 0.83004391, "num_input_tokens_seen": 67729490, "step": 3137, "time_per_iteration": 2.7538130283355713 }, { "auxiliary_loss_clip": 0.01159054, "auxiliary_loss_mlp": 0.01048373, "balance_loss_clip": 1.05827475, "balance_loss_mlp": 1.03107548, "epoch": 0.18866676687208778, "flos": 21251109317760.0, "grad_norm": 1.9799764624272802, "language_loss": 0.81274408, "learning_rate": 3.741673189793504e-06, "loss": 0.83481836, "num_input_tokens_seen": 67749665, "step": 3138, "time_per_iteration": 4.143909931182861 }, { "auxiliary_loss_clip": 0.01150082, "auxiliary_loss_mlp": 0.01056444, "balance_loss_clip": 1.05626798, "balance_loss_mlp": 1.03713167, "epoch": 0.18872689012475574, "flos": 37308673013760.0, "grad_norm": 2.326218248348143, "language_loss": 0.63655496, "learning_rate": 3.7414817080795896e-06, "loss": 0.65862024, "num_input_tokens_seen": 67776230, "step": 3139, "time_per_iteration": 4.30991268157959 }, { "auxiliary_loss_clip": 0.0115289, "auxiliary_loss_mlp": 0.01043021, "balance_loss_clip": 1.05286491, "balance_loss_mlp": 1.02356625, "epoch": 0.1887870133774237, "flos": 21652303299840.0, "grad_norm": 2.1185902638296525, "language_loss": 0.7148211, "learning_rate": 3.741290160328514e-06, "loss": 0.73678017, "num_input_tokens_seen": 67795080, "step": 3140, "time_per_iteration": 2.6880578994750977 }, { "auxiliary_loss_clip": 0.01154738, "auxiliary_loss_mlp": 0.01043099, "balance_loss_clip": 1.05349982, "balance_loss_mlp": 1.02382278, "epoch": 0.1888471366300917, "flos": 15924659374080.0, "grad_norm": 2.6250212982316574, "language_loss": 0.87069929, "learning_rate": 3.7410985465475412e-06, "loss": 0.89267766, "num_input_tokens_seen": 67813110, "step": 3141, "time_per_iteration": 2.6677181720733643 }, { "auxiliary_loss_clip": 0.01130655, "auxiliary_loss_mlp": 0.01052882, "balance_loss_clip": 1.0507834, "balance_loss_mlp": 1.03243756, "epoch": 0.18890725988275966, "flos": 18551955767040.0, "grad_norm": 1.873404502116747, "language_loss": 0.7744689, "learning_rate": 3.7409068667439378e-06, "loss": 0.79630429, "num_input_tokens_seen": 67831070, "step": 3142, "time_per_iteration": 2.63077449798584 }, { "auxiliary_loss_clip": 0.01128192, "auxiliary_loss_mlp": 0.01038074, "balance_loss_clip": 1.05298221, "balance_loss_mlp": 1.02132463, "epoch": 0.18896738313542763, "flos": 28840865184000.0, "grad_norm": 1.6611052928231447, "language_loss": 0.78867507, "learning_rate": 3.740715120924971e-06, "loss": 0.81033778, "num_input_tokens_seen": 67852170, "step": 3143, "time_per_iteration": 4.417406797409058 }, { "auxiliary_loss_clip": 0.0111986, "auxiliary_loss_mlp": 0.01048019, "balance_loss_clip": 1.05024099, "balance_loss_mlp": 1.02821851, "epoch": 0.1890275063880956, "flos": 22412747157120.0, "grad_norm": 2.855732191409361, "language_loss": 0.71476078, "learning_rate": 3.740523309097912e-06, "loss": 0.73643959, "num_input_tokens_seen": 67869945, "step": 3144, "time_per_iteration": 2.8104894161224365 }, { "auxiliary_loss_clip": 0.01125398, "auxiliary_loss_mlp": 0.01044816, "balance_loss_clip": 1.05102479, "balance_loss_mlp": 1.02492023, "epoch": 0.18908762964076356, "flos": 24243904552320.0, "grad_norm": 2.5973078221757144, "language_loss": 0.73390597, "learning_rate": 3.7403314312700356e-06, "loss": 0.75560808, "num_input_tokens_seen": 67890240, "step": 3145, "time_per_iteration": 2.715609312057495 }, { "auxiliary_loss_clip": 0.01110308, "auxiliary_loss_mlp": 0.01042542, "balance_loss_clip": 1.04543984, "balance_loss_mlp": 1.02446938, "epoch": 0.18914775289343153, "flos": 16982910892800.0, "grad_norm": 2.915733862437625, "language_loss": 0.76263785, "learning_rate": 3.740139487448616e-06, "loss": 0.78416634, "num_input_tokens_seen": 67907825, "step": 3146, "time_per_iteration": 2.777221202850342 }, { "auxiliary_loss_clip": 0.01092807, "auxiliary_loss_mlp": 0.01049336, "balance_loss_clip": 1.04319823, "balance_loss_mlp": 1.02829611, "epoch": 0.1892078761460995, "flos": 21543781334400.0, "grad_norm": 1.988128972125699, "language_loss": 0.7837925, "learning_rate": 3.7399474776409326e-06, "loss": 0.80521393, "num_input_tokens_seen": 67926670, "step": 3147, "time_per_iteration": 2.8039205074310303 }, { "auxiliary_loss_clip": 0.01143577, "auxiliary_loss_mlp": 0.01042953, "balance_loss_clip": 1.0548687, "balance_loss_mlp": 1.02454758, "epoch": 0.18926799939876748, "flos": 23001538896000.0, "grad_norm": 3.932544798883504, "language_loss": 0.67477876, "learning_rate": 3.739755401854267e-06, "loss": 0.69664401, "num_input_tokens_seen": 67943645, "step": 3148, "time_per_iteration": 2.7273359298706055 }, { "auxiliary_loss_clip": 0.01112331, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.04617155, "balance_loss_mlp": 1.02014899, "epoch": 0.18932812265143545, "flos": 22273019251200.0, "grad_norm": 2.9848849244070315, "language_loss": 0.76207471, "learning_rate": 3.739563260095902e-06, "loss": 0.78358936, "num_input_tokens_seen": 67962345, "step": 3149, "time_per_iteration": 2.8031978607177734 }, { "auxiliary_loss_clip": 0.01130375, "auxiliary_loss_mlp": 0.01045773, "balance_loss_clip": 1.05438852, "balance_loss_mlp": 1.02797484, "epoch": 0.1893882459041034, "flos": 18624423456000.0, "grad_norm": 2.3661599820320136, "language_loss": 0.80378366, "learning_rate": 3.7393710523731245e-06, "loss": 0.82554519, "num_input_tokens_seen": 67979760, "step": 3150, "time_per_iteration": 2.7836129665374756 }, { "auxiliary_loss_clip": 0.01137112, "auxiliary_loss_mlp": 0.0104876, "balance_loss_clip": 1.0528239, "balance_loss_mlp": 1.03019929, "epoch": 0.18944836915677138, "flos": 22892981016960.0, "grad_norm": 2.0711129864945956, "language_loss": 0.85251844, "learning_rate": 3.7391787786932215e-06, "loss": 0.87437713, "num_input_tokens_seen": 67996895, "step": 3151, "time_per_iteration": 2.7782201766967773 }, { "auxiliary_loss_clip": 0.01121267, "auxiliary_loss_mlp": 0.01046776, "balance_loss_clip": 1.05223882, "balance_loss_mlp": 1.02839363, "epoch": 0.18950849240943934, "flos": 26796542526720.0, "grad_norm": 2.1337439707996673, "language_loss": 0.74114192, "learning_rate": 3.7389864390634857e-06, "loss": 0.76282233, "num_input_tokens_seen": 68018365, "step": 3152, "time_per_iteration": 2.8767755031585693 }, { "auxiliary_loss_clip": 0.01120312, "auxiliary_loss_mlp": 0.0104438, "balance_loss_clip": 1.05119991, "balance_loss_mlp": 1.02463925, "epoch": 0.1895686156621073, "flos": 24971239048320.0, "grad_norm": 1.9471461777193173, "language_loss": 0.75520492, "learning_rate": 3.738794033491209e-06, "loss": 0.77685189, "num_input_tokens_seen": 68037985, "step": 3153, "time_per_iteration": 2.7722980976104736 }, { "auxiliary_loss_clip": 0.01158287, "auxiliary_loss_mlp": 0.01049678, "balance_loss_clip": 1.0559293, "balance_loss_mlp": 1.03102183, "epoch": 0.1896287389147753, "flos": 21944544353280.0, "grad_norm": 2.099749434473157, "language_loss": 0.79984629, "learning_rate": 3.7386015619836887e-06, "loss": 0.82192594, "num_input_tokens_seen": 68057975, "step": 3154, "time_per_iteration": 2.6530587673187256 }, { "auxiliary_loss_clip": 0.01117992, "auxiliary_loss_mlp": 0.01056707, "balance_loss_clip": 1.04851115, "balance_loss_mlp": 1.03536844, "epoch": 0.18968886216744327, "flos": 18179058723840.0, "grad_norm": 3.210440214164498, "language_loss": 0.73046303, "learning_rate": 3.738409024548223e-06, "loss": 0.75220996, "num_input_tokens_seen": 68074175, "step": 3155, "time_per_iteration": 2.729832410812378 }, { "auxiliary_loss_clip": 0.01126019, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.05104291, "balance_loss_mlp": 1.02626419, "epoch": 0.18974898542011123, "flos": 20412487509120.0, "grad_norm": 1.8299076145086866, "language_loss": 0.73869717, "learning_rate": 3.7382164211921136e-06, "loss": 0.76041389, "num_input_tokens_seen": 68095230, "step": 3156, "time_per_iteration": 2.6747231483459473 }, { "auxiliary_loss_clip": 0.01156549, "auxiliary_loss_mlp": 0.0104418, "balance_loss_clip": 1.05489409, "balance_loss_mlp": 1.02645326, "epoch": 0.1898091086727792, "flos": 23985024255360.0, "grad_norm": 1.9629652277148564, "language_loss": 0.68053937, "learning_rate": 3.7380237519226623e-06, "loss": 0.70254672, "num_input_tokens_seen": 68113805, "step": 3157, "time_per_iteration": 2.7092478275299072 }, { "auxiliary_loss_clip": 0.01114914, "auxiliary_loss_mlp": 0.01044181, "balance_loss_clip": 1.04805827, "balance_loss_mlp": 1.02533436, "epoch": 0.18986923192544716, "flos": 27637067756160.0, "grad_norm": 1.7829025355963362, "language_loss": 0.79893303, "learning_rate": 3.737831016747176e-06, "loss": 0.82052404, "num_input_tokens_seen": 68133190, "step": 3158, "time_per_iteration": 2.7921364307403564 }, { "auxiliary_loss_clip": 0.01163231, "auxiliary_loss_mlp": 0.01049502, "balance_loss_clip": 1.05787683, "balance_loss_mlp": 1.02923679, "epoch": 0.18992935517811513, "flos": 25484151306240.0, "grad_norm": 1.856283461980025, "language_loss": 0.72348613, "learning_rate": 3.737638215672964e-06, "loss": 0.74561346, "num_input_tokens_seen": 68152330, "step": 3159, "time_per_iteration": 2.6111273765563965 }, { "auxiliary_loss_clip": 0.01149613, "auxiliary_loss_mlp": 0.01053808, "balance_loss_clip": 1.05840325, "balance_loss_mlp": 1.03386414, "epoch": 0.1899894784307831, "flos": 17420805596160.0, "grad_norm": 2.2573250756933647, "language_loss": 0.84977192, "learning_rate": 3.7374453487073366e-06, "loss": 0.87180614, "num_input_tokens_seen": 68170185, "step": 3160, "time_per_iteration": 2.659259796142578 }, { "auxiliary_loss_clip": 0.01129342, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.05297387, "balance_loss_mlp": 1.03289795, "epoch": 0.19004960168345109, "flos": 27492240119040.0, "grad_norm": 2.752358611011079, "language_loss": 0.73407793, "learning_rate": 3.7372524158576074e-06, "loss": 0.7558704, "num_input_tokens_seen": 68191665, "step": 3161, "time_per_iteration": 2.784040689468384 }, { "auxiliary_loss_clip": 0.01139858, "auxiliary_loss_mlp": 0.0105519, "balance_loss_clip": 1.05456805, "balance_loss_mlp": 1.03476942, "epoch": 0.19010972493611905, "flos": 38654676385920.0, "grad_norm": 1.6629026055958476, "language_loss": 0.8115741, "learning_rate": 3.7370594171310926e-06, "loss": 0.83352458, "num_input_tokens_seen": 68214635, "step": 3162, "time_per_iteration": 2.9375386238098145 }, { "auxiliary_loss_clip": 0.01157449, "auxiliary_loss_mlp": 0.01040035, "balance_loss_clip": 1.05625844, "balance_loss_mlp": 1.02062798, "epoch": 0.19016984818878702, "flos": 19244744357760.0, "grad_norm": 2.448016750033594, "language_loss": 0.75615001, "learning_rate": 3.73686635253511e-06, "loss": 0.77812481, "num_input_tokens_seen": 68232150, "step": 3163, "time_per_iteration": 2.7344541549682617 }, { "auxiliary_loss_clip": 0.0110099, "auxiliary_loss_mlp": 0.01050093, "balance_loss_clip": 1.050578, "balance_loss_mlp": 1.02880192, "epoch": 0.19022997144145498, "flos": 37596891744000.0, "grad_norm": 2.2644227245470514, "language_loss": 0.74093997, "learning_rate": 3.736673222076982e-06, "loss": 0.76245081, "num_input_tokens_seen": 68253370, "step": 3164, "time_per_iteration": 2.9165730476379395 }, { "auxiliary_loss_clip": 0.01141317, "auxiliary_loss_mlp": 0.01038043, "balance_loss_clip": 1.05518687, "balance_loss_mlp": 1.0195303, "epoch": 0.19029009469412295, "flos": 61530921665280.0, "grad_norm": 1.5484522746055986, "language_loss": 0.66844344, "learning_rate": 3.7364800257640313e-06, "loss": 0.69023699, "num_input_tokens_seen": 68278895, "step": 3165, "time_per_iteration": 3.006096124649048 }, { "auxiliary_loss_clip": 0.01146225, "auxiliary_loss_mlp": 0.0104856, "balance_loss_clip": 1.05512285, "balance_loss_mlp": 1.02848506, "epoch": 0.1903502179467909, "flos": 13954851480960.0, "grad_norm": 2.8598536292657144, "language_loss": 0.74239767, "learning_rate": 3.7362867636035835e-06, "loss": 0.76434553, "num_input_tokens_seen": 68294880, "step": 3166, "time_per_iteration": 2.678844928741455 }, { "auxiliary_loss_clip": 0.01050093, "auxiliary_loss_mlp": 0.01014959, "balance_loss_clip": 1.04342103, "balance_loss_mlp": 1.01201403, "epoch": 0.1904103411994589, "flos": 66899641916160.0, "grad_norm": 0.7754190343967906, "language_loss": 0.50311053, "learning_rate": 3.736093435602968e-06, "loss": 0.52376103, "num_input_tokens_seen": 68359665, "step": 3167, "time_per_iteration": 3.277529239654541 }, { "auxiliary_loss_clip": 0.01138483, "auxiliary_loss_mlp": 0.01051348, "balance_loss_clip": 1.05485487, "balance_loss_mlp": 1.03293037, "epoch": 0.19047046445212687, "flos": 21908741472000.0, "grad_norm": 2.3487387451986192, "language_loss": 0.74504036, "learning_rate": 3.7359000417695156e-06, "loss": 0.76693863, "num_input_tokens_seen": 68378950, "step": 3168, "time_per_iteration": 2.690995216369629 }, { "auxiliary_loss_clip": 0.01040165, "auxiliary_loss_mlp": 0.01023518, "balance_loss_clip": 1.03869283, "balance_loss_mlp": 1.02085996, "epoch": 0.19053058770479483, "flos": 59255156701440.0, "grad_norm": 0.8605055473788603, "language_loss": 0.60079956, "learning_rate": 3.73570658211056e-06, "loss": 0.62143636, "num_input_tokens_seen": 68434235, "step": 3169, "time_per_iteration": 3.2108101844787598 }, { "auxiliary_loss_clip": 0.01103792, "auxiliary_loss_mlp": 0.01056606, "balance_loss_clip": 1.05267787, "balance_loss_mlp": 1.03741288, "epoch": 0.1905907109574628, "flos": 23951304362880.0, "grad_norm": 1.5575975614891868, "language_loss": 0.78179795, "learning_rate": 3.735513056633436e-06, "loss": 0.80340189, "num_input_tokens_seen": 68453830, "step": 3170, "time_per_iteration": 2.832043409347534 }, { "auxiliary_loss_clip": 0.01142047, "auxiliary_loss_mlp": 0.01045041, "balance_loss_clip": 1.05325115, "balance_loss_mlp": 1.02605128, "epoch": 0.19065083421013077, "flos": 20812316774400.0, "grad_norm": 1.7671932984988854, "language_loss": 0.78177166, "learning_rate": 3.7353194653454834e-06, "loss": 0.80364257, "num_input_tokens_seen": 68473005, "step": 3171, "time_per_iteration": 2.7823612689971924 }, { "auxiliary_loss_clip": 0.01158227, "auxiliary_loss_mlp": 0.01047345, "balance_loss_clip": 1.05499291, "balance_loss_mlp": 1.0285697, "epoch": 0.19071095746279873, "flos": 31284981192960.0, "grad_norm": 2.1976685633770905, "language_loss": 0.77953529, "learning_rate": 3.7351258082540426e-06, "loss": 0.80159104, "num_input_tokens_seen": 68493470, "step": 3172, "time_per_iteration": 2.746279001235962 }, { "auxiliary_loss_clip": 0.01145112, "auxiliary_loss_mlp": 0.01055334, "balance_loss_clip": 1.05438328, "balance_loss_mlp": 1.03703523, "epoch": 0.1907710807154667, "flos": 14356117290240.0, "grad_norm": 1.5258786569967644, "language_loss": 0.80223799, "learning_rate": 3.7349320853664576e-06, "loss": 0.82424247, "num_input_tokens_seen": 68511290, "step": 3173, "time_per_iteration": 2.7396810054779053 }, { "auxiliary_loss_clip": 0.01113266, "auxiliary_loss_mlp": 0.00778142, "balance_loss_clip": 1.04967713, "balance_loss_mlp": 1.00094676, "epoch": 0.1908312039681347, "flos": 26907039740160.0, "grad_norm": 1.5341307852526682, "language_loss": 0.78495061, "learning_rate": 3.7347382966900735e-06, "loss": 0.80386466, "num_input_tokens_seen": 68532575, "step": 3174, "time_per_iteration": 2.8579304218292236 }, { "auxiliary_loss_clip": 0.01106714, "auxiliary_loss_mlp": 0.01047557, "balance_loss_clip": 1.04928994, "balance_loss_mlp": 1.02838778, "epoch": 0.19089132722080265, "flos": 14494695960960.0, "grad_norm": 1.8075853216546063, "language_loss": 0.81067109, "learning_rate": 3.7345444422322395e-06, "loss": 0.83221382, "num_input_tokens_seen": 68548760, "step": 3175, "time_per_iteration": 2.718254804611206 }, { "auxiliary_loss_clip": 0.01080497, "auxiliary_loss_mlp": 0.01053652, "balance_loss_clip": 1.04361629, "balance_loss_mlp": 1.0342685, "epoch": 0.19095145047347062, "flos": 13952876232960.0, "grad_norm": 2.2545261224105873, "language_loss": 0.85529047, "learning_rate": 3.7343505220003067e-06, "loss": 0.87663192, "num_input_tokens_seen": 68563100, "step": 3176, "time_per_iteration": 4.2962729930877686 }, { "auxiliary_loss_clip": 0.0113361, "auxiliary_loss_mlp": 0.01059849, "balance_loss_clip": 1.05418086, "balance_loss_mlp": 1.03928506, "epoch": 0.19101157372613858, "flos": 25301832848640.0, "grad_norm": 2.0896270593066832, "language_loss": 0.813025, "learning_rate": 3.7341565360016285e-06, "loss": 0.83495957, "num_input_tokens_seen": 68581650, "step": 3177, "time_per_iteration": 2.815127372741699 }, { "auxiliary_loss_clip": 0.01122377, "auxiliary_loss_mlp": 0.01044946, "balance_loss_clip": 1.0482533, "balance_loss_mlp": 1.0265398, "epoch": 0.19107169697880655, "flos": 20558212986240.0, "grad_norm": 2.67963335978105, "language_loss": 0.7530241, "learning_rate": 3.73396248424356e-06, "loss": 0.7746973, "num_input_tokens_seen": 68600360, "step": 3178, "time_per_iteration": 4.351228475570679 }, { "auxiliary_loss_clip": 0.01146729, "auxiliary_loss_mlp": 0.01042476, "balance_loss_clip": 1.05574143, "balance_loss_mlp": 1.02458286, "epoch": 0.19113182023147451, "flos": 22163204396160.0, "grad_norm": 4.753014277211421, "language_loss": 0.81381619, "learning_rate": 3.7337683667334606e-06, "loss": 0.83570826, "num_input_tokens_seen": 68617885, "step": 3179, "time_per_iteration": 4.259284019470215 }, { "auxiliary_loss_clip": 0.01147837, "auxiliary_loss_mlp": 0.01048144, "balance_loss_clip": 1.05645823, "balance_loss_mlp": 1.0291661, "epoch": 0.19119194348414248, "flos": 18581796990720.0, "grad_norm": 2.753081884541086, "language_loss": 0.79384613, "learning_rate": 3.733574183478691e-06, "loss": 0.81580591, "num_input_tokens_seen": 68634550, "step": 3180, "time_per_iteration": 2.6609203815460205 }, { "auxiliary_loss_clip": 0.01129361, "auxiliary_loss_mlp": 0.0105402, "balance_loss_clip": 1.05249727, "balance_loss_mlp": 1.03445804, "epoch": 0.19125206673681047, "flos": 19026623018880.0, "grad_norm": 2.660238694189741, "language_loss": 0.79517245, "learning_rate": 3.733379934486615e-06, "loss": 0.81700623, "num_input_tokens_seen": 68651895, "step": 3181, "time_per_iteration": 2.6877176761627197 }, { "auxiliary_loss_clip": 0.0114301, "auxiliary_loss_mlp": 0.01053621, "balance_loss_clip": 1.05339336, "balance_loss_mlp": 1.03527462, "epoch": 0.19131218998947844, "flos": 21690153256320.0, "grad_norm": 2.2179888965480243, "language_loss": 0.74570775, "learning_rate": 3.7331856197645973e-06, "loss": 0.76767409, "num_input_tokens_seen": 68671500, "step": 3182, "time_per_iteration": 4.2829508781433105 }, { "auxiliary_loss_clip": 0.01128679, "auxiliary_loss_mlp": 0.01044063, "balance_loss_clip": 1.05578041, "balance_loss_mlp": 1.02575254, "epoch": 0.1913723132421464, "flos": 18442500048000.0, "grad_norm": 1.7534728284311585, "language_loss": 0.64618582, "learning_rate": 3.7329912393200084e-06, "loss": 0.66791326, "num_input_tokens_seen": 68690570, "step": 3183, "time_per_iteration": 2.7652854919433594 }, { "auxiliary_loss_clip": 0.01132257, "auxiliary_loss_mlp": 0.01050867, "balance_loss_clip": 1.0512805, "balance_loss_mlp": 1.0311259, "epoch": 0.19143243649481437, "flos": 27160102033920.0, "grad_norm": 1.555926798692704, "language_loss": 0.73459226, "learning_rate": 3.7327967931602173e-06, "loss": 0.75642347, "num_input_tokens_seen": 68709735, "step": 3184, "time_per_iteration": 2.6929056644439697 }, { "auxiliary_loss_clip": 0.01122578, "auxiliary_loss_mlp": 0.01054123, "balance_loss_clip": 1.05015373, "balance_loss_mlp": 1.03347623, "epoch": 0.19149255974748233, "flos": 21718952985600.0, "grad_norm": 2.0989643169058514, "language_loss": 0.87983418, "learning_rate": 3.732602281292598e-06, "loss": 0.9016012, "num_input_tokens_seen": 68727565, "step": 3185, "time_per_iteration": 2.6859230995178223 }, { "auxiliary_loss_clip": 0.01153787, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.05334914, "balance_loss_mlp": 1.02505302, "epoch": 0.1915526830001503, "flos": 22963293889920.0, "grad_norm": 2.4520480945942587, "language_loss": 0.73240852, "learning_rate": 3.7324077037245267e-06, "loss": 0.75439072, "num_input_tokens_seen": 68748110, "step": 3186, "time_per_iteration": 2.6398978233337402 }, { "auxiliary_loss_clip": 0.01132874, "auxiliary_loss_mlp": 0.01044989, "balance_loss_clip": 1.05609488, "balance_loss_mlp": 1.02379346, "epoch": 0.1916128062528183, "flos": 26140741966080.0, "grad_norm": 2.739457234253781, "language_loss": 0.83550584, "learning_rate": 3.7322130604633825e-06, "loss": 0.85728443, "num_input_tokens_seen": 68769765, "step": 3187, "time_per_iteration": 2.7476372718811035 }, { "auxiliary_loss_clip": 0.01076264, "auxiliary_loss_mlp": 0.01021317, "balance_loss_clip": 1.04604995, "balance_loss_mlp": 1.01892138, "epoch": 0.19167292950548626, "flos": 54925767457920.0, "grad_norm": 0.8659386797819415, "language_loss": 0.55824959, "learning_rate": 3.732018351516544e-06, "loss": 0.57922542, "num_input_tokens_seen": 68826815, "step": 3188, "time_per_iteration": 3.2144031524658203 }, { "auxiliary_loss_clip": 0.01139007, "auxiliary_loss_mlp": 0.01054399, "balance_loss_clip": 1.054564, "balance_loss_mlp": 1.03537333, "epoch": 0.19173305275815422, "flos": 29935601942400.0, "grad_norm": 2.2897904709915573, "language_loss": 0.69839454, "learning_rate": 3.731823576891397e-06, "loss": 0.72032857, "num_input_tokens_seen": 68847585, "step": 3189, "time_per_iteration": 2.7998950481414795 }, { "auxiliary_loss_clip": 0.01118438, "auxiliary_loss_mlp": 0.01038566, "balance_loss_clip": 1.04930174, "balance_loss_mlp": 1.02116132, "epoch": 0.1917931760108222, "flos": 24752471264640.0, "grad_norm": 2.362312815249866, "language_loss": 0.74320328, "learning_rate": 3.7316287365953266e-06, "loss": 0.76477331, "num_input_tokens_seen": 68866620, "step": 3190, "time_per_iteration": 2.7386670112609863 }, { "auxiliary_loss_clip": 0.01111071, "auxiliary_loss_mlp": 0.0106718, "balance_loss_clip": 1.04946983, "balance_loss_mlp": 1.04702199, "epoch": 0.19185329926349015, "flos": 18843550375680.0, "grad_norm": 3.545467698458187, "language_loss": 0.8444041, "learning_rate": 3.73143383063572e-06, "loss": 0.8661865, "num_input_tokens_seen": 68885515, "step": 3191, "time_per_iteration": 2.7025794982910156 }, { "auxiliary_loss_clip": 0.01127894, "auxiliary_loss_mlp": 0.01039849, "balance_loss_clip": 1.05251908, "balance_loss_mlp": 1.02231336, "epoch": 0.19191342251615812, "flos": 22086858038400.0, "grad_norm": 2.0663841109071526, "language_loss": 0.89985192, "learning_rate": 3.73123885901997e-06, "loss": 0.92152941, "num_input_tokens_seen": 68903225, "step": 3192, "time_per_iteration": 2.802852153778076 }, { "auxiliary_loss_clip": 0.01130336, "auxiliary_loss_mlp": 0.01054766, "balance_loss_clip": 1.05716372, "balance_loss_mlp": 1.03509688, "epoch": 0.19197354576882608, "flos": 22199115018240.0, "grad_norm": 2.3467564445058775, "language_loss": 0.75159264, "learning_rate": 3.7310438217554687e-06, "loss": 0.77344358, "num_input_tokens_seen": 68922860, "step": 3193, "time_per_iteration": 2.7680914402008057 }, { "auxiliary_loss_clip": 0.01128303, "auxiliary_loss_mlp": 0.00777332, "balance_loss_clip": 1.05222785, "balance_loss_mlp": 1.00071752, "epoch": 0.19203366902149407, "flos": 24896185580160.0, "grad_norm": 2.078743387775855, "language_loss": 0.75189757, "learning_rate": 3.730848718849612e-06, "loss": 0.77095383, "num_input_tokens_seen": 68943000, "step": 3194, "time_per_iteration": 2.7537553310394287 }, { "auxiliary_loss_clip": 0.01068142, "auxiliary_loss_mlp": 0.01004387, "balance_loss_clip": 1.03910232, "balance_loss_mlp": 1.00182378, "epoch": 0.19209379227416204, "flos": 68416722789120.0, "grad_norm": 0.7955224937316553, "language_loss": 0.68507159, "learning_rate": 3.7306535503097985e-06, "loss": 0.70579696, "num_input_tokens_seen": 69000255, "step": 3195, "time_per_iteration": 3.117191791534424 }, { "auxiliary_loss_clip": 0.01116081, "auxiliary_loss_mlp": 0.01052392, "balance_loss_clip": 1.05205238, "balance_loss_mlp": 1.0320189, "epoch": 0.19215391552683, "flos": 22055185221120.0, "grad_norm": 2.6559439291645757, "language_loss": 0.73141015, "learning_rate": 3.730458316143429e-06, "loss": 0.75309479, "num_input_tokens_seen": 69019665, "step": 3196, "time_per_iteration": 2.7234303951263428 }, { "auxiliary_loss_clip": 0.01139018, "auxiliary_loss_mlp": 0.01044947, "balance_loss_clip": 1.06151462, "balance_loss_mlp": 1.02596927, "epoch": 0.19221403877949797, "flos": 20302959962880.0, "grad_norm": 3.0997718824135734, "language_loss": 0.83654135, "learning_rate": 3.7302630163579068e-06, "loss": 0.85838103, "num_input_tokens_seen": 69039055, "step": 3197, "time_per_iteration": 2.72575306892395 }, { "auxiliary_loss_clip": 0.01086216, "auxiliary_loss_mlp": 0.01055059, "balance_loss_clip": 1.04615641, "balance_loss_mlp": 1.03320754, "epoch": 0.19227416203216594, "flos": 23185329811200.0, "grad_norm": 2.2465298420006383, "language_loss": 0.80656433, "learning_rate": 3.7300676509606373e-06, "loss": 0.82797706, "num_input_tokens_seen": 69056370, "step": 3198, "time_per_iteration": 2.741678237915039 }, { "auxiliary_loss_clip": 0.01135487, "auxiliary_loss_mlp": 0.01056572, "balance_loss_clip": 1.05502987, "balance_loss_mlp": 1.03655636, "epoch": 0.1923342852848339, "flos": 25776607841280.0, "grad_norm": 1.9205907836873994, "language_loss": 0.78993976, "learning_rate": 3.729872219959029e-06, "loss": 0.81186032, "num_input_tokens_seen": 69075915, "step": 3199, "time_per_iteration": 2.7821297645568848 }, { "auxiliary_loss_clip": 0.01116808, "auxiliary_loss_mlp": 0.01056964, "balance_loss_clip": 1.05010581, "balance_loss_mlp": 1.036412, "epoch": 0.19239440853750187, "flos": 17128349061120.0, "grad_norm": 3.662083840248298, "language_loss": 0.83574522, "learning_rate": 3.7296767233604934e-06, "loss": 0.85748297, "num_input_tokens_seen": 69094145, "step": 3200, "time_per_iteration": 2.7095022201538086 }, { "auxiliary_loss_clip": 0.01159025, "auxiliary_loss_mlp": 0.01048823, "balance_loss_clip": 1.05997193, "balance_loss_mlp": 1.03060746, "epoch": 0.19245453179016986, "flos": 16435093593600.0, "grad_norm": 1.9278966392289572, "language_loss": 0.79092836, "learning_rate": 3.729481161172443e-06, "loss": 0.81300688, "num_input_tokens_seen": 69111110, "step": 3201, "time_per_iteration": 2.684979200363159 }, { "auxiliary_loss_clip": 0.01103349, "auxiliary_loss_mlp": 0.01053366, "balance_loss_clip": 1.04825675, "balance_loss_mlp": 1.03418541, "epoch": 0.19251465504283782, "flos": 20230276792320.0, "grad_norm": 2.4062417134527645, "language_loss": 0.69276404, "learning_rate": 3.7292855334022927e-06, "loss": 0.71433127, "num_input_tokens_seen": 69130280, "step": 3202, "time_per_iteration": 2.8284943103790283 }, { "auxiliary_loss_clip": 0.01132334, "auxiliary_loss_mlp": 0.01041011, "balance_loss_clip": 1.05389905, "balance_loss_mlp": 1.02256894, "epoch": 0.1925747782955058, "flos": 19464374067840.0, "grad_norm": 1.9491265782204168, "language_loss": 0.91396749, "learning_rate": 3.7290898400574627e-06, "loss": 0.93570089, "num_input_tokens_seen": 69149570, "step": 3203, "time_per_iteration": 2.802433729171753 }, { "auxiliary_loss_clip": 0.0114953, "auxiliary_loss_mlp": 0.01049732, "balance_loss_clip": 1.05674863, "balance_loss_mlp": 1.02959776, "epoch": 0.19263490154817375, "flos": 17785586165760.0, "grad_norm": 5.05881669068558, "language_loss": 0.81689429, "learning_rate": 3.7288940811453725e-06, "loss": 0.83888692, "num_input_tokens_seen": 69168190, "step": 3204, "time_per_iteration": 2.671285629272461 }, { "auxiliary_loss_clip": 0.01116988, "auxiliary_loss_mlp": 0.01048941, "balance_loss_clip": 1.04950142, "balance_loss_mlp": 1.0298202, "epoch": 0.19269502480084172, "flos": 17457075354240.0, "grad_norm": 2.296941025186916, "language_loss": 0.76167846, "learning_rate": 3.7286982566734454e-06, "loss": 0.78333771, "num_input_tokens_seen": 69186950, "step": 3205, "time_per_iteration": 2.8654470443725586 }, { "auxiliary_loss_clip": 0.01140852, "auxiliary_loss_mlp": 0.01046651, "balance_loss_clip": 1.05839586, "balance_loss_mlp": 1.02749407, "epoch": 0.19275514805350968, "flos": 21506901045120.0, "grad_norm": 3.761768843322395, "language_loss": 0.83394569, "learning_rate": 3.728502366649107e-06, "loss": 0.85582072, "num_input_tokens_seen": 69204850, "step": 3206, "time_per_iteration": 2.8610613346099854 }, { "auxiliary_loss_clip": 0.0105715, "auxiliary_loss_mlp": 0.01004055, "balance_loss_clip": 1.03779244, "balance_loss_mlp": 1.00174224, "epoch": 0.19281527130617768, "flos": 47695979738880.0, "grad_norm": 0.8644529519848262, "language_loss": 0.60561717, "learning_rate": 3.728306411079786e-06, "loss": 0.62622917, "num_input_tokens_seen": 69259200, "step": 3207, "time_per_iteration": 3.126537322998047 }, { "auxiliary_loss_clip": 0.01120285, "auxiliary_loss_mlp": 0.01045527, "balance_loss_clip": 1.05201781, "balance_loss_mlp": 1.02678764, "epoch": 0.19287539455884564, "flos": 11801252672640.0, "grad_norm": 2.296187182186814, "language_loss": 0.75463599, "learning_rate": 3.7281103899729125e-06, "loss": 0.77629405, "num_input_tokens_seen": 69275835, "step": 3208, "time_per_iteration": 2.6978750228881836 }, { "auxiliary_loss_clip": 0.01150534, "auxiliary_loss_mlp": 0.00777875, "balance_loss_clip": 1.05520236, "balance_loss_mlp": 1.00063884, "epoch": 0.1929355178115136, "flos": 20631434860800.0, "grad_norm": 1.9483983315924505, "language_loss": 0.60869855, "learning_rate": 3.7279143033359195e-06, "loss": 0.62798262, "num_input_tokens_seen": 69294810, "step": 3209, "time_per_iteration": 2.699798107147217 }, { "auxiliary_loss_clip": 0.01158758, "auxiliary_loss_mlp": 0.01053815, "balance_loss_clip": 1.05472994, "balance_loss_mlp": 1.03261995, "epoch": 0.19299564106418157, "flos": 40807916058240.0, "grad_norm": 1.9992177661428934, "language_loss": 0.80025005, "learning_rate": 3.727718151176243e-06, "loss": 0.82237577, "num_input_tokens_seen": 69316065, "step": 3210, "time_per_iteration": 2.832665205001831 }, { "auxiliary_loss_clip": 0.01118997, "auxiliary_loss_mlp": 0.01047494, "balance_loss_clip": 1.05044246, "balance_loss_mlp": 1.02920699, "epoch": 0.19305576431684954, "flos": 11361418634880.0, "grad_norm": 2.515510367397107, "language_loss": 0.82571948, "learning_rate": 3.7275219335013217e-06, "loss": 0.84738445, "num_input_tokens_seen": 69332900, "step": 3211, "time_per_iteration": 2.7664191722869873 }, { "auxiliary_loss_clip": 0.01073663, "auxiliary_loss_mlp": 0.01002544, "balance_loss_clip": 1.03501034, "balance_loss_mlp": 1.00021982, "epoch": 0.1931158875695175, "flos": 54511895975040.0, "grad_norm": 0.9633495631759209, "language_loss": 0.63641912, "learning_rate": 3.7273256503185953e-06, "loss": 0.6571812, "num_input_tokens_seen": 69382535, "step": 3212, "time_per_iteration": 2.974940299987793 }, { "auxiliary_loss_clip": 0.01131742, "auxiliary_loss_mlp": 0.01044059, "balance_loss_clip": 1.05586314, "balance_loss_mlp": 1.02565336, "epoch": 0.19317601082218547, "flos": 19828436365440.0, "grad_norm": 1.7209148950717332, "language_loss": 0.76375663, "learning_rate": 3.7271293016355074e-06, "loss": 0.78551459, "num_input_tokens_seen": 69400600, "step": 3213, "time_per_iteration": 2.7898454666137695 }, { "auxiliary_loss_clip": 0.01123196, "auxiliary_loss_mlp": 0.0105066, "balance_loss_clip": 1.05261111, "balance_loss_mlp": 1.03116894, "epoch": 0.19323613407485346, "flos": 13152068467200.0, "grad_norm": 2.349758973823363, "language_loss": 0.70871878, "learning_rate": 3.726932887459503e-06, "loss": 0.73045731, "num_input_tokens_seen": 69417350, "step": 3214, "time_per_iteration": 2.8155152797698975 }, { "auxiliary_loss_clip": 0.01155585, "auxiliary_loss_mlp": 0.01047831, "balance_loss_clip": 1.05412841, "balance_loss_mlp": 1.02807808, "epoch": 0.19329625732752143, "flos": 14027247342720.0, "grad_norm": 2.190607045917922, "language_loss": 0.75067955, "learning_rate": 3.72673640779803e-06, "loss": 0.77271378, "num_input_tokens_seen": 69431845, "step": 3215, "time_per_iteration": 4.111938238143921 }, { "auxiliary_loss_clip": 0.01112217, "auxiliary_loss_mlp": 0.01049964, "balance_loss_clip": 1.04928339, "balance_loss_mlp": 1.0323447, "epoch": 0.1933563805801894, "flos": 23441732069760.0, "grad_norm": 1.7842520268521305, "language_loss": 0.88426638, "learning_rate": 3.72653986265854e-06, "loss": 0.9058882, "num_input_tokens_seen": 69453275, "step": 3216, "time_per_iteration": 2.7699615955352783 }, { "auxiliary_loss_clip": 0.01153806, "auxiliary_loss_mlp": 0.01052131, "balance_loss_clip": 1.05435801, "balance_loss_mlp": 1.03442836, "epoch": 0.19341650383285736, "flos": 20485314334080.0, "grad_norm": 1.6996051239972392, "language_loss": 0.7974773, "learning_rate": 3.726343252048485e-06, "loss": 0.81953669, "num_input_tokens_seen": 69471830, "step": 3217, "time_per_iteration": 2.6788718700408936 }, { "auxiliary_loss_clip": 0.01143281, "auxiliary_loss_mlp": 0.0104914, "balance_loss_clip": 1.05695105, "balance_loss_mlp": 1.02864754, "epoch": 0.19347662708552532, "flos": 17858484817920.0, "grad_norm": 4.708784796317305, "language_loss": 0.6161437, "learning_rate": 3.7261465759753206e-06, "loss": 0.6380679, "num_input_tokens_seen": 69489320, "step": 3218, "time_per_iteration": 4.352849960327148 }, { "auxiliary_loss_clip": 0.01157355, "auxiliary_loss_mlp": 0.01047211, "balance_loss_clip": 1.05723107, "balance_loss_mlp": 1.02873373, "epoch": 0.1935367503381933, "flos": 18187247024640.0, "grad_norm": 1.9724785552136583, "language_loss": 0.80345452, "learning_rate": 3.7259498344465053e-06, "loss": 0.82550013, "num_input_tokens_seen": 69506665, "step": 3219, "time_per_iteration": 4.1739161014556885 }, { "auxiliary_loss_clip": 0.01104687, "auxiliary_loss_mlp": 0.01047672, "balance_loss_clip": 1.05145359, "balance_loss_mlp": 1.02819324, "epoch": 0.19359687359086128, "flos": 15957122290560.0, "grad_norm": 2.7508533279024077, "language_loss": 0.85693008, "learning_rate": 3.7257530274694993e-06, "loss": 0.87845367, "num_input_tokens_seen": 69523835, "step": 3220, "time_per_iteration": 2.777284622192383 }, { "auxiliary_loss_clip": 0.01149581, "auxiliary_loss_mlp": 0.01041747, "balance_loss_clip": 1.05441856, "balance_loss_mlp": 1.02511764, "epoch": 0.19365699684352924, "flos": 21215198695680.0, "grad_norm": 2.05545450883527, "language_loss": 0.84637755, "learning_rate": 3.725556155051766e-06, "loss": 0.86829084, "num_input_tokens_seen": 69542620, "step": 3221, "time_per_iteration": 4.224115371704102 }, { "auxiliary_loss_clip": 0.01143661, "auxiliary_loss_mlp": 0.01044558, "balance_loss_clip": 1.05466259, "balance_loss_mlp": 1.02730846, "epoch": 0.1937171200961972, "flos": 17311098481920.0, "grad_norm": 2.658004231066563, "language_loss": 0.86087942, "learning_rate": 3.7253592172007702e-06, "loss": 0.8827616, "num_input_tokens_seen": 69561130, "step": 3222, "time_per_iteration": 2.6400530338287354 }, { "auxiliary_loss_clip": 0.01069453, "auxiliary_loss_mlp": 0.01045281, "balance_loss_clip": 1.04206085, "balance_loss_mlp": 1.02599275, "epoch": 0.19377724334886517, "flos": 22635968227200.0, "grad_norm": 1.8604116943694204, "language_loss": 0.78510809, "learning_rate": 3.72516221392398e-06, "loss": 0.8062554, "num_input_tokens_seen": 69580425, "step": 3223, "time_per_iteration": 2.9685652256011963 }, { "auxiliary_loss_clip": 0.01146062, "auxiliary_loss_mlp": 0.01046815, "balance_loss_clip": 1.05697751, "balance_loss_mlp": 1.02819431, "epoch": 0.19383736660153314, "flos": 15077813351040.0, "grad_norm": 1.8958208586464897, "language_loss": 0.75391948, "learning_rate": 3.7249651452288653e-06, "loss": 0.77584827, "num_input_tokens_seen": 69597085, "step": 3224, "time_per_iteration": 2.665294885635376 }, { "auxiliary_loss_clip": 0.01102293, "auxiliary_loss_mlp": 0.01050181, "balance_loss_clip": 1.04728186, "balance_loss_mlp": 1.02927208, "epoch": 0.1938974898542011, "flos": 47119934350080.0, "grad_norm": 3.358076005999295, "language_loss": 0.71180636, "learning_rate": 3.7247680111229e-06, "loss": 0.73333108, "num_input_tokens_seen": 69618885, "step": 3225, "time_per_iteration": 2.997511863708496 }, { "auxiliary_loss_clip": 0.0112035, "auxiliary_loss_mlp": 0.01053167, "balance_loss_clip": 1.0519309, "balance_loss_mlp": 1.03480864, "epoch": 0.19395761310686907, "flos": 25812554376960.0, "grad_norm": 2.42331686427639, "language_loss": 0.69379079, "learning_rate": 3.7245708116135585e-06, "loss": 0.71552593, "num_input_tokens_seen": 69638200, "step": 3226, "time_per_iteration": 2.746338129043579 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.01042276, "balance_loss_clip": 1.05692983, "balance_loss_mlp": 1.02264214, "epoch": 0.19401773635953706, "flos": 23039604334080.0, "grad_norm": 2.1006513764454864, "language_loss": 0.76236808, "learning_rate": 3.7243735467083193e-06, "loss": 0.78405869, "num_input_tokens_seen": 69657550, "step": 3227, "time_per_iteration": 2.760087728500366 }, { "auxiliary_loss_clip": 0.01117794, "auxiliary_loss_mlp": 0.010438, "balance_loss_clip": 1.05304587, "balance_loss_mlp": 1.0256561, "epoch": 0.19407785961220503, "flos": 15920780705280.0, "grad_norm": 2.8268368707906397, "language_loss": 0.69577461, "learning_rate": 3.724176216414662e-06, "loss": 0.71739054, "num_input_tokens_seen": 69675005, "step": 3228, "time_per_iteration": 2.6779348850250244 }, { "auxiliary_loss_clip": 0.01148199, "auxiliary_loss_mlp": 0.01042315, "balance_loss_clip": 1.05775642, "balance_loss_mlp": 1.02445757, "epoch": 0.194137982864873, "flos": 25921722787200.0, "grad_norm": 1.7694943420266864, "language_loss": 0.74160898, "learning_rate": 3.72397882074007e-06, "loss": 0.76351416, "num_input_tokens_seen": 69696455, "step": 3229, "time_per_iteration": 2.7229623794555664 }, { "auxiliary_loss_clip": 0.01119678, "auxiliary_loss_mlp": 0.01044155, "balance_loss_clip": 1.05435359, "balance_loss_mlp": 1.0262022, "epoch": 0.19419810611754096, "flos": 13261344618240.0, "grad_norm": 1.9766126324167548, "language_loss": 0.65722096, "learning_rate": 3.7237813596920285e-06, "loss": 0.67885935, "num_input_tokens_seen": 69714245, "step": 3230, "time_per_iteration": 2.740324020385742 }, { "auxiliary_loss_clip": 0.01124671, "auxiliary_loss_mlp": 0.00776003, "balance_loss_clip": 1.05223823, "balance_loss_mlp": 1.00081468, "epoch": 0.19425822937020892, "flos": 15705568368000.0, "grad_norm": 1.9307338208311895, "language_loss": 0.82042694, "learning_rate": 3.7235838332780254e-06, "loss": 0.83943367, "num_input_tokens_seen": 69731515, "step": 3231, "time_per_iteration": 2.7453513145446777 }, { "auxiliary_loss_clip": 0.0113141, "auxiliary_loss_mlp": 0.01042332, "balance_loss_clip": 1.05393946, "balance_loss_mlp": 1.02220988, "epoch": 0.1943183526228769, "flos": 23105392093440.0, "grad_norm": 10.866686758212083, "language_loss": 0.87038374, "learning_rate": 3.72338624150555e-06, "loss": 0.89212114, "num_input_tokens_seen": 69748885, "step": 3232, "time_per_iteration": 2.7575178146362305 }, { "auxiliary_loss_clip": 0.01100451, "auxiliary_loss_mlp": 0.01050878, "balance_loss_clip": 1.05029583, "balance_loss_mlp": 1.03102958, "epoch": 0.19437847587554485, "flos": 24712610146560.0, "grad_norm": 2.531838729905544, "language_loss": 0.85189134, "learning_rate": 3.723188584382096e-06, "loss": 0.87340462, "num_input_tokens_seen": 69767540, "step": 3233, "time_per_iteration": 2.8617444038391113 }, { "auxiliary_loss_clip": 0.01149478, "auxiliary_loss_mlp": 0.01054519, "balance_loss_clip": 1.0574832, "balance_loss_mlp": 1.0357672, "epoch": 0.19443859912821285, "flos": 23116130259840.0, "grad_norm": 1.7408859410354203, "language_loss": 0.89099532, "learning_rate": 3.722990861915158e-06, "loss": 0.91303527, "num_input_tokens_seen": 69789340, "step": 3234, "time_per_iteration": 2.7648239135742188 }, { "auxiliary_loss_clip": 0.01135157, "auxiliary_loss_mlp": 0.01044708, "balance_loss_clip": 1.05003643, "balance_loss_mlp": 1.02544403, "epoch": 0.1944987223808808, "flos": 15084385539840.0, "grad_norm": 2.4074482975555926, "language_loss": 0.78673434, "learning_rate": 3.722793074112234e-06, "loss": 0.80853301, "num_input_tokens_seen": 69806470, "step": 3235, "time_per_iteration": 2.76930832862854 }, { "auxiliary_loss_clip": 0.01136497, "auxiliary_loss_mlp": 0.01046749, "balance_loss_clip": 1.0580672, "balance_loss_mlp": 1.0293448, "epoch": 0.19455884563354878, "flos": 17126876603520.0, "grad_norm": 2.2511193258734354, "language_loss": 0.79391634, "learning_rate": 3.7225952209808233e-06, "loss": 0.81574875, "num_input_tokens_seen": 69822655, "step": 3236, "time_per_iteration": 2.7060179710388184 }, { "auxiliary_loss_clip": 0.01156991, "auxiliary_loss_mlp": 0.01044638, "balance_loss_clip": 1.05862045, "balance_loss_mlp": 1.02482522, "epoch": 0.19461896888621674, "flos": 20193396503040.0, "grad_norm": 2.1553329609131713, "language_loss": 0.76224017, "learning_rate": 3.72239730252843e-06, "loss": 0.78425646, "num_input_tokens_seen": 69841895, "step": 3237, "time_per_iteration": 2.642235040664673 }, { "auxiliary_loss_clip": 0.01158804, "auxiliary_loss_mlp": 0.01051059, "balance_loss_clip": 1.05648041, "balance_loss_mlp": 1.03289127, "epoch": 0.1946790921388847, "flos": 25301365971840.0, "grad_norm": 1.5204653275468003, "language_loss": 0.74828202, "learning_rate": 3.7221993187625583e-06, "loss": 0.77038062, "num_input_tokens_seen": 69862220, "step": 3238, "time_per_iteration": 2.6618688106536865 }, { "auxiliary_loss_clip": 0.01108331, "auxiliary_loss_mlp": 0.01046572, "balance_loss_clip": 1.04992437, "balance_loss_mlp": 1.02791595, "epoch": 0.19473921539155267, "flos": 20193396503040.0, "grad_norm": 3.1324225641798518, "language_loss": 0.734164, "learning_rate": 3.7220012696907155e-06, "loss": 0.75571299, "num_input_tokens_seen": 69881830, "step": 3239, "time_per_iteration": 2.7637152671813965 }, { "auxiliary_loss_clip": 0.01132567, "auxiliary_loss_mlp": 0.01047988, "balance_loss_clip": 1.05458641, "balance_loss_mlp": 1.02947509, "epoch": 0.19479933864422067, "flos": 20887549810560.0, "grad_norm": 2.155392951393246, "language_loss": 0.73291272, "learning_rate": 3.721803155320412e-06, "loss": 0.7547183, "num_input_tokens_seen": 69900515, "step": 3240, "time_per_iteration": 2.6980888843536377 }, { "auxiliary_loss_clip": 0.01131601, "auxiliary_loss_mlp": 0.0103943, "balance_loss_clip": 1.05846488, "balance_loss_mlp": 1.02208555, "epoch": 0.19485946189688863, "flos": 23295072839040.0, "grad_norm": 5.847648280625993, "language_loss": 0.65809447, "learning_rate": 3.7216049756591606e-06, "loss": 0.6798048, "num_input_tokens_seen": 69920060, "step": 3241, "time_per_iteration": 2.659707546234131 }, { "auxiliary_loss_clip": 0.01128971, "auxiliary_loss_mlp": 0.01048707, "balance_loss_clip": 1.05226684, "balance_loss_mlp": 1.03039646, "epoch": 0.1949195851495566, "flos": 23295036925440.0, "grad_norm": 1.4408225707306088, "language_loss": 0.82747853, "learning_rate": 3.7214067307144754e-06, "loss": 0.84925532, "num_input_tokens_seen": 69939820, "step": 3242, "time_per_iteration": 2.7137632369995117 }, { "auxiliary_loss_clip": 0.01077632, "auxiliary_loss_mlp": 0.01014225, "balance_loss_clip": 1.04083347, "balance_loss_mlp": 1.01131678, "epoch": 0.19497970840222456, "flos": 64962871557120.0, "grad_norm": 0.853263603243422, "language_loss": 0.57500821, "learning_rate": 3.721208420493875e-06, "loss": 0.59592682, "num_input_tokens_seen": 70002145, "step": 3243, "time_per_iteration": 3.1446309089660645 }, { "auxiliary_loss_clip": 0.01138548, "auxiliary_loss_mlp": 0.01050428, "balance_loss_clip": 1.05331421, "balance_loss_mlp": 1.02988815, "epoch": 0.19503983165489253, "flos": 19644717277440.0, "grad_norm": 7.2345723863132, "language_loss": 0.83789021, "learning_rate": 3.7210100450048784e-06, "loss": 0.85977995, "num_input_tokens_seen": 70020510, "step": 3244, "time_per_iteration": 2.6194229125976562 }, { "auxiliary_loss_clip": 0.01143261, "auxiliary_loss_mlp": 0.01046223, "balance_loss_clip": 1.05732584, "balance_loss_mlp": 1.02869976, "epoch": 0.1950999549075605, "flos": 21141976821120.0, "grad_norm": 2.0710390949438837, "language_loss": 0.7739507, "learning_rate": 3.7208116042550088e-06, "loss": 0.79584551, "num_input_tokens_seen": 70040760, "step": 3245, "time_per_iteration": 2.6684374809265137 }, { "auxiliary_loss_clip": 0.01142874, "auxiliary_loss_mlp": 0.01043114, "balance_loss_clip": 1.05566645, "balance_loss_mlp": 1.02431464, "epoch": 0.19516007816022846, "flos": 20884820376960.0, "grad_norm": 2.1010289547443133, "language_loss": 0.83988321, "learning_rate": 3.7206130982517906e-06, "loss": 0.86174309, "num_input_tokens_seen": 70058720, "step": 3246, "time_per_iteration": 2.6595354080200195 }, { "auxiliary_loss_clip": 0.0114599, "auxiliary_loss_mlp": 0.00776442, "balance_loss_clip": 1.05517101, "balance_loss_mlp": 1.00080454, "epoch": 0.19522020141289645, "flos": 16910515031040.0, "grad_norm": 3.3581015873305438, "language_loss": 0.76840878, "learning_rate": 3.7204145270027514e-06, "loss": 0.78763306, "num_input_tokens_seen": 70076470, "step": 3247, "time_per_iteration": 2.7777793407440186 }, { "auxiliary_loss_clip": 0.01121778, "auxiliary_loss_mlp": 0.01043977, "balance_loss_clip": 1.05689096, "balance_loss_mlp": 1.02651262, "epoch": 0.19528032466556441, "flos": 26724829023360.0, "grad_norm": 1.8981807103962522, "language_loss": 0.75459039, "learning_rate": 3.720215890515421e-06, "loss": 0.77624786, "num_input_tokens_seen": 70096220, "step": 3248, "time_per_iteration": 2.8088901042938232 }, { "auxiliary_loss_clip": 0.01156017, "auxiliary_loss_mlp": 0.01048303, "balance_loss_clip": 1.05548215, "balance_loss_mlp": 1.03008783, "epoch": 0.19534044791823238, "flos": 21032808410880.0, "grad_norm": 2.7209722336942135, "language_loss": 0.77774823, "learning_rate": 3.7200171887973316e-06, "loss": 0.79979146, "num_input_tokens_seen": 70114800, "step": 3249, "time_per_iteration": 2.610877752304077 }, { "auxiliary_loss_clip": 0.01148434, "auxiliary_loss_mlp": 0.01050332, "balance_loss_clip": 1.05689144, "balance_loss_mlp": 1.03299928, "epoch": 0.19540057117090034, "flos": 22344050396160.0, "grad_norm": 1.5551573885822045, "language_loss": 0.73118901, "learning_rate": 3.7198184218560176e-06, "loss": 0.75317669, "num_input_tokens_seen": 70134930, "step": 3250, "time_per_iteration": 2.5901567935943604 }, { "auxiliary_loss_clip": 0.01101628, "auxiliary_loss_mlp": 0.01046467, "balance_loss_clip": 1.05080378, "balance_loss_mlp": 1.02876413, "epoch": 0.1954606944235683, "flos": 20301631159680.0, "grad_norm": 2.030501302548557, "language_loss": 0.79203367, "learning_rate": 3.719619589699017e-06, "loss": 0.81351459, "num_input_tokens_seen": 70152045, "step": 3251, "time_per_iteration": 2.6619749069213867 }, { "auxiliary_loss_clip": 0.0115825, "auxiliary_loss_mlp": 0.01044132, "balance_loss_clip": 1.05741858, "balance_loss_mlp": 1.02606022, "epoch": 0.19552081767623627, "flos": 17346865449600.0, "grad_norm": 7.451515078679223, "language_loss": 0.83871722, "learning_rate": 3.7194206923338695e-06, "loss": 0.86074108, "num_input_tokens_seen": 70169240, "step": 3252, "time_per_iteration": 2.5029656887054443 }, { "auxiliary_loss_clip": 0.01142752, "auxiliary_loss_mlp": 0.01057294, "balance_loss_clip": 1.05278862, "balance_loss_mlp": 1.03518057, "epoch": 0.19558094092890424, "flos": 31977626129280.0, "grad_norm": 1.7140417843701068, "language_loss": 0.73995864, "learning_rate": 3.719221729768117e-06, "loss": 0.76195908, "num_input_tokens_seen": 70192690, "step": 3253, "time_per_iteration": 2.609117269515991 }, { "auxiliary_loss_clip": 0.01102675, "auxiliary_loss_mlp": 0.01046707, "balance_loss_clip": 1.04759037, "balance_loss_mlp": 1.02782381, "epoch": 0.19564106418157223, "flos": 22268889187200.0, "grad_norm": 2.1302159220485675, "language_loss": 0.76167047, "learning_rate": 3.7190227020093037e-06, "loss": 0.78316426, "num_input_tokens_seen": 70209685, "step": 3254, "time_per_iteration": 4.174965858459473 }, { "auxiliary_loss_clip": 0.01043127, "auxiliary_loss_mlp": 0.01006966, "balance_loss_clip": 1.04737842, "balance_loss_mlp": 1.0036757, "epoch": 0.1957011874342402, "flos": 54364554385920.0, "grad_norm": 0.84452007287803, "language_loss": 0.55275303, "learning_rate": 3.7188236090649774e-06, "loss": 0.57325399, "num_input_tokens_seen": 70265050, "step": 3255, "time_per_iteration": 3.2241716384887695 }, { "auxiliary_loss_clip": 0.01133721, "auxiliary_loss_mlp": 0.01041696, "balance_loss_clip": 1.0557251, "balance_loss_mlp": 1.02349281, "epoch": 0.19576131068690816, "flos": 16506699356160.0, "grad_norm": 2.6103802859468392, "language_loss": 0.70870697, "learning_rate": 3.718624450942688e-06, "loss": 0.73046112, "num_input_tokens_seen": 70281830, "step": 3256, "time_per_iteration": 2.641296148300171 }, { "auxiliary_loss_clip": 0.01152768, "auxiliary_loss_mlp": 0.01042867, "balance_loss_clip": 1.0544858, "balance_loss_mlp": 1.02523613, "epoch": 0.19582143393957613, "flos": 14719676797440.0, "grad_norm": 2.649319646209249, "language_loss": 0.80722409, "learning_rate": 3.718425227649987e-06, "loss": 0.82918048, "num_input_tokens_seen": 70297420, "step": 3257, "time_per_iteration": 4.258259057998657 }, { "auxiliary_loss_clip": 0.01106644, "auxiliary_loss_mlp": 0.01043385, "balance_loss_clip": 1.05470431, "balance_loss_mlp": 1.02601588, "epoch": 0.1958815571922441, "flos": 24425504737920.0, "grad_norm": 6.015808523610408, "language_loss": 0.75124931, "learning_rate": 3.7182259391944292e-06, "loss": 0.77274966, "num_input_tokens_seen": 70319210, "step": 3258, "time_per_iteration": 4.386433362960815 }, { "auxiliary_loss_clip": 0.01082287, "auxiliary_loss_mlp": 0.01044148, "balance_loss_clip": 1.04533339, "balance_loss_mlp": 1.0237875, "epoch": 0.19594168044491206, "flos": 24900279730560.0, "grad_norm": 1.8034996675319444, "language_loss": 0.73872411, "learning_rate": 3.7180265855835714e-06, "loss": 0.75998843, "num_input_tokens_seen": 70339045, "step": 3259, "time_per_iteration": 2.815469264984131 }, { "auxiliary_loss_clip": 0.01131793, "auxiliary_loss_mlp": 0.01043364, "balance_loss_clip": 1.05167735, "balance_loss_mlp": 1.02392125, "epoch": 0.19600180369758005, "flos": 12057008486400.0, "grad_norm": 2.2096667980592, "language_loss": 0.77053022, "learning_rate": 3.7178271668249735e-06, "loss": 0.79228187, "num_input_tokens_seen": 70356505, "step": 3260, "time_per_iteration": 4.2817702293396 }, { "auxiliary_loss_clip": 0.01148118, "auxiliary_loss_mlp": 0.01043761, "balance_loss_clip": 1.0551343, "balance_loss_mlp": 1.0248661, "epoch": 0.19606192695024802, "flos": 20850202644480.0, "grad_norm": 5.605178759176999, "language_loss": 0.82261205, "learning_rate": 3.7176276829261975e-06, "loss": 0.84453082, "num_input_tokens_seen": 70375410, "step": 3261, "time_per_iteration": 2.673092842102051 }, { "auxiliary_loss_clip": 0.01121379, "auxiliary_loss_mlp": 0.01044043, "balance_loss_clip": 1.0550617, "balance_loss_mlp": 1.02488637, "epoch": 0.19612205020291598, "flos": 28475509996800.0, "grad_norm": 1.8492209450679535, "language_loss": 0.76671481, "learning_rate": 3.717428133894807e-06, "loss": 0.78836906, "num_input_tokens_seen": 70396315, "step": 3262, "time_per_iteration": 2.803938150405884 }, { "auxiliary_loss_clip": 0.01148893, "auxiliary_loss_mlp": 0.01047259, "balance_loss_clip": 1.05960584, "balance_loss_mlp": 1.02950907, "epoch": 0.19618217345558395, "flos": 25556618995200.0, "grad_norm": 1.7278621785184562, "language_loss": 0.8668195, "learning_rate": 3.71722851973837e-06, "loss": 0.88878107, "num_input_tokens_seen": 70417945, "step": 3263, "time_per_iteration": 2.6677918434143066 }, { "auxiliary_loss_clip": 0.0113123, "auxiliary_loss_mlp": 0.01042546, "balance_loss_clip": 1.05328059, "balance_loss_mlp": 1.02505815, "epoch": 0.1962422967082519, "flos": 25264413855360.0, "grad_norm": 3.447639973868791, "language_loss": 0.73775035, "learning_rate": 3.717028840464455e-06, "loss": 0.75948811, "num_input_tokens_seen": 70438690, "step": 3264, "time_per_iteration": 2.6973094940185547 }, { "auxiliary_loss_clip": 0.01144053, "auxiliary_loss_mlp": 0.01049918, "balance_loss_clip": 1.05736756, "balance_loss_mlp": 1.03223944, "epoch": 0.19630241996091988, "flos": 18807352444800.0, "grad_norm": 2.4424358562200927, "language_loss": 0.78513813, "learning_rate": 3.7168290960806344e-06, "loss": 0.80707777, "num_input_tokens_seen": 70455385, "step": 3265, "time_per_iteration": 2.625739336013794 }, { "auxiliary_loss_clip": 0.01031434, "auxiliary_loss_mlp": 0.01002481, "balance_loss_clip": 1.03386986, "balance_loss_mlp": 0.99983466, "epoch": 0.19636254321358784, "flos": 62321137896960.0, "grad_norm": 0.7932330660809486, "language_loss": 0.53389955, "learning_rate": 3.716629286594483e-06, "loss": 0.55423868, "num_input_tokens_seen": 70514280, "step": 3266, "time_per_iteration": 3.2586586475372314 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.00776501, "balance_loss_clip": 1.04957044, "balance_loss_mlp": 1.00080895, "epoch": 0.19642266646625584, "flos": 21069329564160.0, "grad_norm": 2.0008611208986133, "language_loss": 0.80109024, "learning_rate": 3.7164294120135767e-06, "loss": 0.8201015, "num_input_tokens_seen": 70531800, "step": 3267, "time_per_iteration": 2.678537368774414 }, { "auxiliary_loss_clip": 0.01130982, "auxiliary_loss_mlp": 0.01043983, "balance_loss_clip": 1.05263019, "balance_loss_mlp": 1.02660179, "epoch": 0.1964827897189238, "flos": 14538651229440.0, "grad_norm": 1.9909459598185588, "language_loss": 0.86758262, "learning_rate": 3.7162294723454953e-06, "loss": 0.88933229, "num_input_tokens_seen": 70550615, "step": 3268, "time_per_iteration": 2.6949849128723145 }, { "auxiliary_loss_clip": 0.01099432, "auxiliary_loss_mlp": 0.01041621, "balance_loss_clip": 1.04954004, "balance_loss_mlp": 1.02408528, "epoch": 0.19654291297159177, "flos": 19244636616960.0, "grad_norm": 2.2632495429204127, "language_loss": 0.68785441, "learning_rate": 3.7160294675978197e-06, "loss": 0.70926493, "num_input_tokens_seen": 70568690, "step": 3269, "time_per_iteration": 2.770078182220459 }, { "auxiliary_loss_clip": 0.01116538, "auxiliary_loss_mlp": 0.01052319, "balance_loss_clip": 1.05113554, "balance_loss_mlp": 1.03330541, "epoch": 0.19660303622425973, "flos": 25775710001280.0, "grad_norm": 7.1863103423452355, "language_loss": 0.80241841, "learning_rate": 3.715829397778135e-06, "loss": 0.82410699, "num_input_tokens_seen": 70588665, "step": 3270, "time_per_iteration": 2.7294864654541016 }, { "auxiliary_loss_clip": 0.01139501, "auxiliary_loss_mlp": 0.01045694, "balance_loss_clip": 1.05189824, "balance_loss_mlp": 1.02833724, "epoch": 0.1966631594769277, "flos": 20595093275520.0, "grad_norm": 1.9668649321541274, "language_loss": 0.83912349, "learning_rate": 3.715629262894028e-06, "loss": 0.86097538, "num_input_tokens_seen": 70606900, "step": 3271, "time_per_iteration": 2.640235662460327 }, { "auxiliary_loss_clip": 0.01139368, "auxiliary_loss_mlp": 0.01051303, "balance_loss_clip": 1.05468225, "balance_loss_mlp": 1.0332067, "epoch": 0.19672328272959566, "flos": 23623188600960.0, "grad_norm": 1.9968416702279483, "language_loss": 0.79902714, "learning_rate": 3.715429062953087e-06, "loss": 0.82093388, "num_input_tokens_seen": 70625955, "step": 3272, "time_per_iteration": 2.636629343032837 }, { "auxiliary_loss_clip": 0.01124328, "auxiliary_loss_mlp": 0.01058493, "balance_loss_clip": 1.05192566, "balance_loss_mlp": 1.03715479, "epoch": 0.19678340598226365, "flos": 23110922787840.0, "grad_norm": 1.7302013075823783, "language_loss": 0.80942369, "learning_rate": 3.7152287979629043e-06, "loss": 0.83125186, "num_input_tokens_seen": 70646090, "step": 3273, "time_per_iteration": 2.6967809200286865 }, { "auxiliary_loss_clip": 0.01144024, "auxiliary_loss_mlp": 0.01054564, "balance_loss_clip": 1.05456042, "balance_loss_mlp": 1.03655195, "epoch": 0.19684352923493162, "flos": 24534852716160.0, "grad_norm": 2.225126358921887, "language_loss": 0.77984649, "learning_rate": 3.7150284679310735e-06, "loss": 0.80183232, "num_input_tokens_seen": 70666065, "step": 3274, "time_per_iteration": 2.6808643341064453 }, { "auxiliary_loss_clip": 0.01141267, "auxiliary_loss_mlp": 0.01046445, "balance_loss_clip": 1.05480242, "balance_loss_mlp": 1.02840877, "epoch": 0.19690365248759958, "flos": 21796448578560.0, "grad_norm": 2.318697297640889, "language_loss": 0.81433225, "learning_rate": 3.7148280728651914e-06, "loss": 0.8362093, "num_input_tokens_seen": 70681580, "step": 3275, "time_per_iteration": 2.672672986984253 }, { "auxiliary_loss_clip": 0.01115756, "auxiliary_loss_mlp": 0.01045314, "balance_loss_clip": 1.05148947, "balance_loss_mlp": 1.02686024, "epoch": 0.19696377574026755, "flos": 19056643810560.0, "grad_norm": 2.4665004531377166, "language_loss": 0.80909657, "learning_rate": 3.7146276127728563e-06, "loss": 0.83070731, "num_input_tokens_seen": 70697745, "step": 3276, "time_per_iteration": 2.726970672607422 }, { "auxiliary_loss_clip": 0.01142619, "auxiliary_loss_mlp": 0.01043042, "balance_loss_clip": 1.05443609, "balance_loss_mlp": 1.02491045, "epoch": 0.19702389899293551, "flos": 22820656982400.0, "grad_norm": 2.17541075016206, "language_loss": 0.89113599, "learning_rate": 3.7144270876616713e-06, "loss": 0.9129926, "num_input_tokens_seen": 70715110, "step": 3277, "time_per_iteration": 2.6738827228546143 }, { "auxiliary_loss_clip": 0.01103709, "auxiliary_loss_mlp": 0.01048433, "balance_loss_clip": 1.04638815, "balance_loss_mlp": 1.02864444, "epoch": 0.19708402224560348, "flos": 22894237992960.0, "grad_norm": 2.640727897616601, "language_loss": 0.62070847, "learning_rate": 3.714226497539239e-06, "loss": 0.64222991, "num_input_tokens_seen": 70734715, "step": 3278, "time_per_iteration": 2.7382938861846924 }, { "auxiliary_loss_clip": 0.01115303, "auxiliary_loss_mlp": 0.0105759, "balance_loss_clip": 1.05033016, "balance_loss_mlp": 1.03793263, "epoch": 0.19714414549827144, "flos": 25662519267840.0, "grad_norm": 1.930104581155035, "language_loss": 0.73606467, "learning_rate": 3.714025842413166e-06, "loss": 0.75779366, "num_input_tokens_seen": 70752650, "step": 3279, "time_per_iteration": 2.8123648166656494 }, { "auxiliary_loss_clip": 0.0114648, "auxiliary_loss_mlp": 0.01042853, "balance_loss_clip": 1.05422091, "balance_loss_mlp": 1.02567458, "epoch": 0.19720426875093944, "flos": 23915824704000.0, "grad_norm": 1.7034036878345749, "language_loss": 0.82685816, "learning_rate": 3.713825122291061e-06, "loss": 0.84875143, "num_input_tokens_seen": 70772365, "step": 3280, "time_per_iteration": 2.7000861167907715 }, { "auxiliary_loss_clip": 0.01106655, "auxiliary_loss_mlp": 0.01048884, "balance_loss_clip": 1.04887283, "balance_loss_mlp": 1.03071654, "epoch": 0.1972643920036074, "flos": 13881952828800.0, "grad_norm": 2.435959864664923, "language_loss": 0.78173983, "learning_rate": 3.713624337180536e-06, "loss": 0.80329525, "num_input_tokens_seen": 70790340, "step": 3281, "time_per_iteration": 2.7017247676849365 }, { "auxiliary_loss_clip": 0.01125353, "auxiliary_loss_mlp": 0.0104135, "balance_loss_clip": 1.05461836, "balance_loss_mlp": 1.02519727, "epoch": 0.19732451525627537, "flos": 19863592801920.0, "grad_norm": 1.7390973872526612, "language_loss": 0.79777479, "learning_rate": 3.7134234870892045e-06, "loss": 0.8194418, "num_input_tokens_seen": 70809295, "step": 3282, "time_per_iteration": 2.7064146995544434 }, { "auxiliary_loss_clip": 0.01112073, "auxiliary_loss_mlp": 0.01043047, "balance_loss_clip": 1.05485284, "balance_loss_mlp": 1.02538049, "epoch": 0.19738463850894333, "flos": 24973429777920.0, "grad_norm": 2.512566515566025, "language_loss": 0.7192747, "learning_rate": 3.7132225720246826e-06, "loss": 0.74082589, "num_input_tokens_seen": 70828765, "step": 3283, "time_per_iteration": 2.775297164916992 }, { "auxiliary_loss_clip": 0.01137498, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.05320621, "balance_loss_mlp": 1.02665281, "epoch": 0.1974447617616113, "flos": 18368883123840.0, "grad_norm": 1.8864815757917637, "language_loss": 0.78981179, "learning_rate": 3.7130215919945886e-06, "loss": 0.81162113, "num_input_tokens_seen": 70846805, "step": 3284, "time_per_iteration": 2.6344916820526123 }, { "auxiliary_loss_clip": 0.01126512, "auxiliary_loss_mlp": 0.00776821, "balance_loss_clip": 1.05065584, "balance_loss_mlp": 1.00114048, "epoch": 0.19750488501427926, "flos": 22892945103360.0, "grad_norm": 2.1903874509936982, "language_loss": 0.86317503, "learning_rate": 3.7128205470065445e-06, "loss": 0.88220835, "num_input_tokens_seen": 70863805, "step": 3285, "time_per_iteration": 2.725186586380005 }, { "auxiliary_loss_clip": 0.01115791, "auxiliary_loss_mlp": 0.01044707, "balance_loss_clip": 1.05167055, "balance_loss_mlp": 1.02658761, "epoch": 0.19756500826694723, "flos": 21871502046720.0, "grad_norm": 2.208260347555195, "language_loss": 0.88770825, "learning_rate": 3.712619437068174e-06, "loss": 0.90931326, "num_input_tokens_seen": 70882660, "step": 3286, "time_per_iteration": 2.6819698810577393 }, { "auxiliary_loss_clip": 0.01118742, "auxiliary_loss_mlp": 0.01052526, "balance_loss_clip": 1.05227792, "balance_loss_mlp": 1.03016233, "epoch": 0.19762513151961522, "flos": 15158972131200.0, "grad_norm": 2.0768117117784874, "language_loss": 0.77941382, "learning_rate": 3.712418262187102e-06, "loss": 0.80112648, "num_input_tokens_seen": 70898765, "step": 3287, "time_per_iteration": 2.641193389892578 }, { "auxiliary_loss_clip": 0.01127955, "auxiliary_loss_mlp": 0.01047337, "balance_loss_clip": 1.0526104, "balance_loss_mlp": 1.02849019, "epoch": 0.1976852547722832, "flos": 16979175878400.0, "grad_norm": 2.061421898899755, "language_loss": 0.80853081, "learning_rate": 3.7122170223709584e-06, "loss": 0.83028376, "num_input_tokens_seen": 70916370, "step": 3288, "time_per_iteration": 2.625068426132202 }, { "auxiliary_loss_clip": 0.01132408, "auxiliary_loss_mlp": 0.01048194, "balance_loss_clip": 1.05143857, "balance_loss_mlp": 1.03045535, "epoch": 0.19774537802495115, "flos": 20302924049280.0, "grad_norm": 2.345717890688315, "language_loss": 0.7317158, "learning_rate": 3.712015717627374e-06, "loss": 0.75352174, "num_input_tokens_seen": 70934870, "step": 3289, "time_per_iteration": 2.6319406032562256 }, { "auxiliary_loss_clip": 0.01133413, "auxiliary_loss_mlp": 0.01045224, "balance_loss_clip": 1.05575252, "balance_loss_mlp": 1.02678204, "epoch": 0.19780550127761912, "flos": 27235478724480.0, "grad_norm": 1.9087552003653308, "language_loss": 0.79608113, "learning_rate": 3.7118143479639813e-06, "loss": 0.81786746, "num_input_tokens_seen": 70955140, "step": 3290, "time_per_iteration": 2.706570863723755 }, { "auxiliary_loss_clip": 0.01049926, "auxiliary_loss_mlp": 0.0101105, "balance_loss_clip": 1.0327636, "balance_loss_mlp": 1.00853467, "epoch": 0.19786562453028708, "flos": 63550972684800.0, "grad_norm": 0.8952067644857119, "language_loss": 0.60318571, "learning_rate": 3.711612913388418e-06, "loss": 0.62379545, "num_input_tokens_seen": 71012005, "step": 3291, "time_per_iteration": 3.2849009037017822 }, { "auxiliary_loss_clip": 0.01158891, "auxiliary_loss_mlp": 0.01040785, "balance_loss_clip": 1.05417156, "balance_loss_mlp": 1.02088892, "epoch": 0.19792574778295505, "flos": 26286647011200.0, "grad_norm": 1.932789926440358, "language_loss": 0.81595641, "learning_rate": 3.7114114139083204e-06, "loss": 0.83795315, "num_input_tokens_seen": 71031140, "step": 3292, "time_per_iteration": 2.6751551628112793 }, { "auxiliary_loss_clip": 0.01119797, "auxiliary_loss_mlp": 0.00778082, "balance_loss_clip": 1.05296063, "balance_loss_mlp": 1.00086236, "epoch": 0.19798587103562304, "flos": 19938107566080.0, "grad_norm": 2.409042629875397, "language_loss": 0.81013, "learning_rate": 3.7112098495313313e-06, "loss": 0.82910883, "num_input_tokens_seen": 71050250, "step": 3293, "time_per_iteration": 4.3039703369140625 }, { "auxiliary_loss_clip": 0.01137316, "auxiliary_loss_mlp": 0.01052434, "balance_loss_clip": 1.05370128, "balance_loss_mlp": 1.03277683, "epoch": 0.198045994288291, "flos": 20120282369280.0, "grad_norm": 1.8764131105986912, "language_loss": 0.61480314, "learning_rate": 3.711008220265093e-06, "loss": 0.63670063, "num_input_tokens_seen": 71068665, "step": 3294, "time_per_iteration": 2.671241044998169 }, { "auxiliary_loss_clip": 0.01132208, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.05456376, "balance_loss_mlp": 1.02201271, "epoch": 0.19810611754095897, "flos": 17967653228160.0, "grad_norm": 2.0334748560156393, "language_loss": 0.87313825, "learning_rate": 3.710806526117251e-06, "loss": 0.89486015, "num_input_tokens_seen": 71085320, "step": 3295, "time_per_iteration": 2.659680128097534 }, { "auxiliary_loss_clip": 0.01113106, "auxiliary_loss_mlp": 0.01050184, "balance_loss_clip": 1.05079484, "balance_loss_mlp": 1.03256536, "epoch": 0.19816624079362694, "flos": 15084996071040.0, "grad_norm": 2.5215255479345067, "language_loss": 0.80839241, "learning_rate": 3.7106047670954544e-06, "loss": 0.83002532, "num_input_tokens_seen": 71102020, "step": 3296, "time_per_iteration": 4.299339294433594 }, { "auxiliary_loss_clip": 0.01123906, "auxiliary_loss_mlp": 0.01045438, "balance_loss_clip": 1.05233586, "balance_loss_mlp": 1.02522039, "epoch": 0.1982263640462949, "flos": 24900315644160.0, "grad_norm": 2.528943220563754, "language_loss": 0.68126047, "learning_rate": 3.710402943207354e-06, "loss": 0.70295388, "num_input_tokens_seen": 71123390, "step": 3297, "time_per_iteration": 4.258284091949463 }, { "auxiliary_loss_clip": 0.01153129, "auxiliary_loss_mlp": 0.01037574, "balance_loss_clip": 1.05660713, "balance_loss_mlp": 1.02031219, "epoch": 0.19828648729896287, "flos": 20376181837440.0, "grad_norm": 1.9083451106828888, "language_loss": 0.81310993, "learning_rate": 3.7102010544606016e-06, "loss": 0.83501697, "num_input_tokens_seen": 71141800, "step": 3298, "time_per_iteration": 2.6156656742095947 }, { "auxiliary_loss_clip": 0.01137409, "auxiliary_loss_mlp": 0.01042227, "balance_loss_clip": 1.0573976, "balance_loss_mlp": 1.02159238, "epoch": 0.19834661055163083, "flos": 18880035615360.0, "grad_norm": 1.8996943203321497, "language_loss": 0.85154539, "learning_rate": 3.7099991008628544e-06, "loss": 0.87334174, "num_input_tokens_seen": 71159505, "step": 3299, "time_per_iteration": 2.6749041080474854 }, { "auxiliary_loss_clip": 0.01036953, "auxiliary_loss_mlp": 0.01013935, "balance_loss_clip": 1.02875936, "balance_loss_mlp": 1.01106215, "epoch": 0.19840673380429882, "flos": 60259184640000.0, "grad_norm": 0.82907550606663, "language_loss": 0.53206414, "learning_rate": 3.7097970824217706e-06, "loss": 0.55257303, "num_input_tokens_seen": 71223265, "step": 3300, "time_per_iteration": 4.83857798576355 }, { "auxiliary_loss_clip": 0.01105122, "auxiliary_loss_mlp": 0.01064471, "balance_loss_clip": 1.04748702, "balance_loss_mlp": 1.0410459, "epoch": 0.1984668570569668, "flos": 19902017376000.0, "grad_norm": 316.1702389408657, "language_loss": 0.73014295, "learning_rate": 3.7095949991450093e-06, "loss": 0.75183886, "num_input_tokens_seen": 71242385, "step": 3301, "time_per_iteration": 2.700654983520508 }, { "auxiliary_loss_clip": 0.01118926, "auxiliary_loss_mlp": 0.01044315, "balance_loss_clip": 1.05295372, "balance_loss_mlp": 1.02619529, "epoch": 0.19852698030963475, "flos": 15630766295040.0, "grad_norm": 2.410718710355122, "language_loss": 0.88264418, "learning_rate": 3.709392851040235e-06, "loss": 0.90427655, "num_input_tokens_seen": 71258990, "step": 3302, "time_per_iteration": 2.7190146446228027 }, { "auxiliary_loss_clip": 0.01118067, "auxiliary_loss_mlp": 0.01045078, "balance_loss_clip": 1.05155802, "balance_loss_mlp": 1.02661204, "epoch": 0.19858710356230272, "flos": 43143007311360.0, "grad_norm": 2.210364764996701, "language_loss": 0.73592931, "learning_rate": 3.709190638115111e-06, "loss": 0.75756073, "num_input_tokens_seen": 71282770, "step": 3303, "time_per_iteration": 2.9379186630249023 }, { "auxiliary_loss_clip": 0.01143275, "auxiliary_loss_mlp": 0.01048515, "balance_loss_clip": 1.05491257, "balance_loss_mlp": 1.03002524, "epoch": 0.19864722681497068, "flos": 35144084643840.0, "grad_norm": 1.9482807590384623, "language_loss": 0.75103521, "learning_rate": 3.7089883603773084e-06, "loss": 0.77295315, "num_input_tokens_seen": 71301410, "step": 3304, "time_per_iteration": 2.743474245071411 }, { "auxiliary_loss_clip": 0.01133571, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 1.05309725, "balance_loss_mlp": 1.01710188, "epoch": 0.19870735006763865, "flos": 19426200888960.0, "grad_norm": 1.8722016114425952, "language_loss": 0.8628391, "learning_rate": 3.7087860178344955e-06, "loss": 0.8845247, "num_input_tokens_seen": 71319670, "step": 3305, "time_per_iteration": 2.7129390239715576 }, { "auxiliary_loss_clip": 0.01128329, "auxiliary_loss_mlp": 0.01044081, "balance_loss_clip": 1.04770195, "balance_loss_mlp": 1.02603281, "epoch": 0.19876747332030664, "flos": 23547380947200.0, "grad_norm": 2.9829227362861106, "language_loss": 0.68476367, "learning_rate": 3.7085836104943445e-06, "loss": 0.70648777, "num_input_tokens_seen": 71339850, "step": 3306, "time_per_iteration": 2.7083208560943604 }, { "auxiliary_loss_clip": 0.01119386, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.04822719, "balance_loss_mlp": 1.02168787, "epoch": 0.1988275965729746, "flos": 19829406032640.0, "grad_norm": 1.683647244561179, "language_loss": 0.76433122, "learning_rate": 3.7083811383645332e-06, "loss": 0.78591287, "num_input_tokens_seen": 71359795, "step": 3307, "time_per_iteration": 2.728661298751831 }, { "auxiliary_loss_clip": 0.01157548, "auxiliary_loss_mlp": 0.01044665, "balance_loss_clip": 1.05895782, "balance_loss_mlp": 1.02714145, "epoch": 0.19888771982564257, "flos": 23513625141120.0, "grad_norm": 2.438172575069382, "language_loss": 0.75991976, "learning_rate": 3.708178601452737e-06, "loss": 0.78194201, "num_input_tokens_seen": 71378885, "step": 3308, "time_per_iteration": 2.6580557823181152 }, { "auxiliary_loss_clip": 0.01107283, "auxiliary_loss_mlp": 0.01041656, "balance_loss_clip": 1.05453563, "balance_loss_mlp": 1.02307141, "epoch": 0.19894784307831054, "flos": 18150510389760.0, "grad_norm": 1.928689575161362, "language_loss": 0.76043576, "learning_rate": 3.7079759997666374e-06, "loss": 0.7819252, "num_input_tokens_seen": 71397285, "step": 3309, "time_per_iteration": 2.77226185798645 }, { "auxiliary_loss_clip": 0.0114115, "auxiliary_loss_mlp": 0.01045061, "balance_loss_clip": 1.05222607, "balance_loss_mlp": 1.02592754, "epoch": 0.1990079663309785, "flos": 24276044246400.0, "grad_norm": 75.17312936609292, "language_loss": 0.87855697, "learning_rate": 3.707773333313917e-06, "loss": 0.90041906, "num_input_tokens_seen": 71415775, "step": 3310, "time_per_iteration": 2.6789662837982178 }, { "auxiliary_loss_clip": 0.01153037, "auxiliary_loss_mlp": 0.01039864, "balance_loss_clip": 1.05415869, "balance_loss_mlp": 1.02139854, "epoch": 0.19906808958364647, "flos": 34897666366080.0, "grad_norm": 2.3155756588664342, "language_loss": 0.63650048, "learning_rate": 3.70757060210226e-06, "loss": 0.6584295, "num_input_tokens_seen": 71437315, "step": 3311, "time_per_iteration": 2.7604620456695557 }, { "auxiliary_loss_clip": 0.01115133, "auxiliary_loss_mlp": 0.01043871, "balance_loss_clip": 1.04763019, "balance_loss_mlp": 1.02501202, "epoch": 0.19912821283631443, "flos": 24024885373440.0, "grad_norm": 3.8064295514597717, "language_loss": 0.74542546, "learning_rate": 3.707367806139355e-06, "loss": 0.76701546, "num_input_tokens_seen": 71456320, "step": 3312, "time_per_iteration": 2.796475410461426 }, { "auxiliary_loss_clip": 0.01141587, "auxiliary_loss_mlp": 0.01037435, "balance_loss_clip": 1.05358124, "balance_loss_mlp": 1.02017355, "epoch": 0.19918833608898243, "flos": 19859031774720.0, "grad_norm": 2.2312990164825943, "language_loss": 0.84033173, "learning_rate": 3.7071649454328915e-06, "loss": 0.86212194, "num_input_tokens_seen": 71475360, "step": 3313, "time_per_iteration": 2.6044952869415283 }, { "auxiliary_loss_clip": 0.01146797, "auxiliary_loss_mlp": 0.01042166, "balance_loss_clip": 1.05695391, "balance_loss_mlp": 1.02422476, "epoch": 0.1992484593416504, "flos": 29095794984960.0, "grad_norm": 3.856678450124864, "language_loss": 0.810305, "learning_rate": 3.7069620199905625e-06, "loss": 0.83219463, "num_input_tokens_seen": 71496155, "step": 3314, "time_per_iteration": 2.68841814994812 }, { "auxiliary_loss_clip": 0.01112846, "auxiliary_loss_mlp": 0.01043677, "balance_loss_clip": 1.04617178, "balance_loss_mlp": 1.02643955, "epoch": 0.19930858259431836, "flos": 23295001011840.0, "grad_norm": 1.4822079401394097, "language_loss": 0.87391549, "learning_rate": 3.7067590298200627e-06, "loss": 0.89548075, "num_input_tokens_seen": 71517295, "step": 3315, "time_per_iteration": 2.720093011856079 }, { "auxiliary_loss_clip": 0.0111589, "auxiliary_loss_mlp": 0.00777002, "balance_loss_clip": 1.04992676, "balance_loss_mlp": 1.00093687, "epoch": 0.19936870584698632, "flos": 25378825651200.0, "grad_norm": 1.7805516248937883, "language_loss": 0.70957202, "learning_rate": 3.7065559749290892e-06, "loss": 0.72850096, "num_input_tokens_seen": 71540000, "step": 3316, "time_per_iteration": 2.850100517272949 }, { "auxiliary_loss_clip": 0.01019745, "auxiliary_loss_mlp": 0.01012504, "balance_loss_clip": 1.03032303, "balance_loss_mlp": 1.01003671, "epoch": 0.1994288290996543, "flos": 62168053109760.0, "grad_norm": 0.8326978726055106, "language_loss": 0.66287398, "learning_rate": 3.706352855325342e-06, "loss": 0.68319643, "num_input_tokens_seen": 71607880, "step": 3317, "time_per_iteration": 3.425114870071411 }, { "auxiliary_loss_clip": 0.01148059, "auxiliary_loss_mlp": 0.01048913, "balance_loss_clip": 1.05397809, "balance_loss_mlp": 1.02964854, "epoch": 0.19948895235232225, "flos": 19025832919680.0, "grad_norm": 2.282515690517884, "language_loss": 0.74494618, "learning_rate": 3.7061496710165233e-06, "loss": 0.76691592, "num_input_tokens_seen": 71625695, "step": 3318, "time_per_iteration": 2.6815896034240723 }, { "auxiliary_loss_clip": 0.01114942, "auxiliary_loss_mlp": 0.01044681, "balance_loss_clip": 1.04767084, "balance_loss_mlp": 1.02786088, "epoch": 0.19954907560499022, "flos": 37815803182080.0, "grad_norm": 1.8966456913695608, "language_loss": 0.78894758, "learning_rate": 3.7059464220103385e-06, "loss": 0.81054389, "num_input_tokens_seen": 71648520, "step": 3319, "time_per_iteration": 2.847911834716797 }, { "auxiliary_loss_clip": 0.01134557, "auxiliary_loss_mlp": 0.01042988, "balance_loss_clip": 1.05354095, "balance_loss_mlp": 1.02312756, "epoch": 0.1996091988576582, "flos": 49565199594240.0, "grad_norm": 2.1348540211051197, "language_loss": 0.76006937, "learning_rate": 3.7057431083144945e-06, "loss": 0.78184479, "num_input_tokens_seen": 71672185, "step": 3320, "time_per_iteration": 2.9324615001678467 }, { "auxiliary_loss_clip": 0.01120226, "auxiliary_loss_mlp": 0.01042998, "balance_loss_clip": 1.05083311, "balance_loss_mlp": 1.02496171, "epoch": 0.19966932211032618, "flos": 22635788659200.0, "grad_norm": 2.2436863685702546, "language_loss": 0.80077857, "learning_rate": 3.705539729936701e-06, "loss": 0.82241082, "num_input_tokens_seen": 71692890, "step": 3321, "time_per_iteration": 2.7534186840057373 }, { "auxiliary_loss_clip": 0.01033096, "auxiliary_loss_mlp": 0.01011167, "balance_loss_clip": 1.02391553, "balance_loss_mlp": 1.00828266, "epoch": 0.19972944536299414, "flos": 54082117745280.0, "grad_norm": 0.874673110280983, "language_loss": 0.65145189, "learning_rate": 3.7053362868846696e-06, "loss": 0.67189455, "num_input_tokens_seen": 71745815, "step": 3322, "time_per_iteration": 3.0398683547973633 }, { "auxiliary_loss_clip": 0.01039999, "auxiliary_loss_mlp": 0.01007775, "balance_loss_clip": 1.02971482, "balance_loss_mlp": 1.00479472, "epoch": 0.1997895686156621, "flos": 69355031817600.0, "grad_norm": 0.7915334307535052, "language_loss": 0.56919783, "learning_rate": 3.7051327791661153e-06, "loss": 0.58967561, "num_input_tokens_seen": 71806915, "step": 3323, "time_per_iteration": 3.2814581394195557 }, { "auxiliary_loss_clip": 0.01131487, "auxiliary_loss_mlp": 0.00776139, "balance_loss_clip": 1.05244064, "balance_loss_mlp": 1.00085235, "epoch": 0.19984969186833007, "flos": 18552063507840.0, "grad_norm": 1.8766856730809967, "language_loss": 0.80573648, "learning_rate": 3.7049292067887555e-06, "loss": 0.82481277, "num_input_tokens_seen": 71824645, "step": 3324, "time_per_iteration": 2.66456937789917 }, { "auxiliary_loss_clip": 0.01132572, "auxiliary_loss_mlp": 0.01050254, "balance_loss_clip": 1.04625165, "balance_loss_mlp": 1.03027487, "epoch": 0.19990981512099804, "flos": 26429678968320.0, "grad_norm": 2.4535669107623486, "language_loss": 0.53931105, "learning_rate": 3.7047255697603092e-06, "loss": 0.56113935, "num_input_tokens_seen": 71845125, "step": 3325, "time_per_iteration": 2.696556329727173 }, { "auxiliary_loss_clip": 0.01130165, "auxiliary_loss_mlp": 0.01050725, "balance_loss_clip": 1.05065942, "balance_loss_mlp": 1.03328443, "epoch": 0.19996993837366603, "flos": 16325997010560.0, "grad_norm": 2.1570763946475187, "language_loss": 0.86074936, "learning_rate": 3.7045218680884984e-06, "loss": 0.88255823, "num_input_tokens_seen": 71863500, "step": 3326, "time_per_iteration": 2.7167885303497314 }, { "auxiliary_loss_clip": 0.0115173, "auxiliary_loss_mlp": 0.01042065, "balance_loss_clip": 1.05427039, "balance_loss_mlp": 1.02511311, "epoch": 0.200030061626334, "flos": 20844169159680.0, "grad_norm": 2.0419576492150395, "language_loss": 0.71793801, "learning_rate": 3.7043181017810476e-06, "loss": 0.73987597, "num_input_tokens_seen": 71881845, "step": 3327, "time_per_iteration": 2.6097662448883057 }, { "auxiliary_loss_clip": 0.01131035, "auxiliary_loss_mlp": 0.01052756, "balance_loss_clip": 1.05146813, "balance_loss_mlp": 1.03290796, "epoch": 0.20009018487900196, "flos": 23762629198080.0, "grad_norm": 1.8948781463857982, "language_loss": 0.7668376, "learning_rate": 3.7041142708456833e-06, "loss": 0.78867549, "num_input_tokens_seen": 71900940, "step": 3328, "time_per_iteration": 2.6869349479675293 }, { "auxiliary_loss_clip": 0.01118681, "auxiliary_loss_mlp": 0.01044603, "balance_loss_clip": 1.04693103, "balance_loss_mlp": 1.02799726, "epoch": 0.20015030813166992, "flos": 28111555440000.0, "grad_norm": 2.0833377369651984, "language_loss": 0.69400644, "learning_rate": 3.7039103752901353e-06, "loss": 0.71563935, "num_input_tokens_seen": 71921925, "step": 3329, "time_per_iteration": 2.844280481338501 }, { "auxiliary_loss_clip": 0.01107384, "auxiliary_loss_mlp": 0.01069575, "balance_loss_clip": 1.04727411, "balance_loss_mlp": 1.04641271, "epoch": 0.2002104313843379, "flos": 26067160955520.0, "grad_norm": 3.099532194576676, "language_loss": 0.81395614, "learning_rate": 3.7037064151221353e-06, "loss": 0.83572567, "num_input_tokens_seen": 71941855, "step": 3330, "time_per_iteration": 2.841885566711426 }, { "auxiliary_loss_clip": 0.01137825, "auxiliary_loss_mlp": 0.01048123, "balance_loss_clip": 1.05147684, "balance_loss_mlp": 1.02977705, "epoch": 0.20027055463700585, "flos": 22966633854720.0, "grad_norm": 2.224132696455658, "language_loss": 0.76606882, "learning_rate": 3.703502390349417e-06, "loss": 0.78792834, "num_input_tokens_seen": 71960915, "step": 3331, "time_per_iteration": 2.7007360458374023 }, { "auxiliary_loss_clip": 0.01093521, "auxiliary_loss_mlp": 0.01069739, "balance_loss_clip": 1.04292202, "balance_loss_mlp": 1.04851985, "epoch": 0.20033067788967382, "flos": 17165660313600.0, "grad_norm": 2.044808670508971, "language_loss": 0.79330826, "learning_rate": 3.7032983009797176e-06, "loss": 0.81494087, "num_input_tokens_seen": 71979220, "step": 3332, "time_per_iteration": 4.518973112106323 }, { "auxiliary_loss_clip": 0.01046467, "auxiliary_loss_mlp": 0.010754, "balance_loss_clip": 1.02134657, "balance_loss_mlp": 1.07303989, "epoch": 0.2003908011423418, "flos": 60825566292480.0, "grad_norm": 0.9607431077817938, "language_loss": 0.61968678, "learning_rate": 3.703094147020776e-06, "loss": 0.64090544, "num_input_tokens_seen": 72033950, "step": 3333, "time_per_iteration": 3.074782371520996 }, { "auxiliary_loss_clip": 0.01112058, "auxiliary_loss_mlp": 0.00777645, "balance_loss_clip": 1.04686844, "balance_loss_mlp": 1.00099933, "epoch": 0.20045092439500978, "flos": 24206234163840.0, "grad_norm": 2.9954165903614447, "language_loss": 0.81385547, "learning_rate": 3.7028899284803334e-06, "loss": 0.83275253, "num_input_tokens_seen": 72051395, "step": 3334, "time_per_iteration": 4.270732641220093 }, { "auxiliary_loss_clip": 0.01096467, "auxiliary_loss_mlp": 0.01058699, "balance_loss_clip": 1.04709518, "balance_loss_mlp": 1.03889799, "epoch": 0.20051104764767774, "flos": 29387605075200.0, "grad_norm": 2.9016061168315703, "language_loss": 0.74238038, "learning_rate": 3.702685645366134e-06, "loss": 0.76393211, "num_input_tokens_seen": 72071305, "step": 3335, "time_per_iteration": 4.376626491546631 }, { "auxiliary_loss_clip": 0.01149242, "auxiliary_loss_mlp": 0.01059851, "balance_loss_clip": 1.05611062, "balance_loss_mlp": 1.04120684, "epoch": 0.2005711709003457, "flos": 23513804709120.0, "grad_norm": 1.700795836589561, "language_loss": 0.79981416, "learning_rate": 3.7024812976859243e-06, "loss": 0.82190514, "num_input_tokens_seen": 72090165, "step": 3336, "time_per_iteration": 2.7031586170196533 }, { "auxiliary_loss_clip": 0.01116655, "auxiliary_loss_mlp": 0.01048065, "balance_loss_clip": 1.04808092, "balance_loss_mlp": 1.0272038, "epoch": 0.20063129415301367, "flos": 22523388024960.0, "grad_norm": 2.0182523905302157, "language_loss": 0.7761423, "learning_rate": 3.7022768854474532e-06, "loss": 0.79778945, "num_input_tokens_seen": 72107210, "step": 3337, "time_per_iteration": 2.6990835666656494 }, { "auxiliary_loss_clip": 0.01158617, "auxiliary_loss_mlp": 0.01045618, "balance_loss_clip": 1.05752003, "balance_loss_mlp": 1.02631783, "epoch": 0.20069141740568164, "flos": 25958243940480.0, "grad_norm": 2.232061800350416, "language_loss": 0.69108742, "learning_rate": 3.7020724086584724e-06, "loss": 0.71312982, "num_input_tokens_seen": 72126315, "step": 3338, "time_per_iteration": 2.6827659606933594 }, { "auxiliary_loss_clip": 0.01117671, "auxiliary_loss_mlp": 0.01053755, "balance_loss_clip": 1.04930723, "balance_loss_mlp": 1.03543282, "epoch": 0.2007515406583496, "flos": 24790608529920.0, "grad_norm": 2.685005372503905, "language_loss": 0.68898237, "learning_rate": 3.701867867326735e-06, "loss": 0.71069658, "num_input_tokens_seen": 72146470, "step": 3339, "time_per_iteration": 4.430418014526367 }, { "auxiliary_loss_clip": 0.01123098, "auxiliary_loss_mlp": 0.01041763, "balance_loss_clip": 1.05656064, "balance_loss_mlp": 1.02408433, "epoch": 0.2008116639110176, "flos": 37925582123520.0, "grad_norm": 2.0597617887640607, "language_loss": 0.66606021, "learning_rate": 3.7016632614599974e-06, "loss": 0.6877088, "num_input_tokens_seen": 72166600, "step": 3340, "time_per_iteration": 3.0020461082458496 }, { "auxiliary_loss_clip": 0.01145166, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.05326021, "balance_loss_mlp": 1.01712155, "epoch": 0.20087178716368556, "flos": 20740531443840.0, "grad_norm": 6.669810478748975, "language_loss": 0.74554622, "learning_rate": 3.701458591066019e-06, "loss": 0.76736599, "num_input_tokens_seen": 72185160, "step": 3341, "time_per_iteration": 2.762573480606079 }, { "auxiliary_loss_clip": 0.01110242, "auxiliary_loss_mlp": 0.01044424, "balance_loss_clip": 1.04981375, "balance_loss_mlp": 1.02595794, "epoch": 0.20093191041635353, "flos": 23842279607040.0, "grad_norm": 7.177474445031109, "language_loss": 0.71779013, "learning_rate": 3.70125385615256e-06, "loss": 0.73933673, "num_input_tokens_seen": 72205160, "step": 3342, "time_per_iteration": 2.7128167152404785 }, { "auxiliary_loss_clip": 0.01114025, "auxiliary_loss_mlp": 0.01045057, "balance_loss_clip": 1.05036438, "balance_loss_mlp": 1.02749765, "epoch": 0.2009920336690215, "flos": 21792067119360.0, "grad_norm": 2.3652416151608873, "language_loss": 0.72892809, "learning_rate": 3.701049056727384e-06, "loss": 0.75051892, "num_input_tokens_seen": 72223555, "step": 3343, "time_per_iteration": 2.8155410289764404 }, { "auxiliary_loss_clip": 0.01113341, "auxiliary_loss_mlp": 0.01046556, "balance_loss_clip": 1.04568779, "balance_loss_mlp": 1.02762532, "epoch": 0.20105215692168946, "flos": 26359222440960.0, "grad_norm": 2.2972411099560195, "language_loss": 0.80645263, "learning_rate": 3.7008441927982574e-06, "loss": 0.82805163, "num_input_tokens_seen": 72242465, "step": 3344, "time_per_iteration": 2.780198335647583 }, { "auxiliary_loss_clip": 0.01155099, "auxiliary_loss_mlp": 0.01045938, "balance_loss_clip": 1.05386972, "balance_loss_mlp": 1.02773499, "epoch": 0.20111228017435742, "flos": 18807280617600.0, "grad_norm": 2.2640230255386125, "language_loss": 0.83114576, "learning_rate": 3.700639264372948e-06, "loss": 0.85315621, "num_input_tokens_seen": 72260655, "step": 3345, "time_per_iteration": 2.6209781169891357 }, { "auxiliary_loss_clip": 0.01093716, "auxiliary_loss_mlp": 0.01041329, "balance_loss_clip": 1.04619193, "balance_loss_mlp": 1.02492619, "epoch": 0.20117240342702541, "flos": 19975059682560.0, "grad_norm": 1.7610524328763844, "language_loss": 0.67947632, "learning_rate": 3.7004342714592283e-06, "loss": 0.70082676, "num_input_tokens_seen": 72279055, "step": 3346, "time_per_iteration": 2.692222833633423 }, { "auxiliary_loss_clip": 0.01114086, "auxiliary_loss_mlp": 0.01048128, "balance_loss_clip": 1.04710329, "balance_loss_mlp": 1.03028262, "epoch": 0.20123252667969338, "flos": 23142703345920.0, "grad_norm": 2.3067659385334958, "language_loss": 0.72993439, "learning_rate": 3.70022921406487e-06, "loss": 0.75155658, "num_input_tokens_seen": 72297895, "step": 3347, "time_per_iteration": 2.7501564025878906 }, { "auxiliary_loss_clip": 0.01142236, "auxiliary_loss_mlp": 0.01047715, "balance_loss_clip": 1.05465829, "balance_loss_mlp": 1.03122878, "epoch": 0.20129264993236134, "flos": 23221671396480.0, "grad_norm": 1.5798788242702444, "language_loss": 0.86869538, "learning_rate": 3.70002409219765e-06, "loss": 0.8905949, "num_input_tokens_seen": 72318385, "step": 3348, "time_per_iteration": 2.688606023788452 }, { "auxiliary_loss_clip": 0.01099793, "auxiliary_loss_mlp": 0.01045183, "balance_loss_clip": 1.04737949, "balance_loss_mlp": 1.02587092, "epoch": 0.2013527731850293, "flos": 21871466133120.0, "grad_norm": 1.8024729376762028, "language_loss": 0.71082795, "learning_rate": 3.699818905865346e-06, "loss": 0.73227775, "num_input_tokens_seen": 72338235, "step": 3349, "time_per_iteration": 2.8423163890838623 }, { "auxiliary_loss_clip": 0.01119982, "auxiliary_loss_mlp": 0.01044662, "balance_loss_clip": 1.0504061, "balance_loss_mlp": 1.02520752, "epoch": 0.20141289643769728, "flos": 18040803275520.0, "grad_norm": 1.7324672298731074, "language_loss": 0.71324664, "learning_rate": 3.6996136550757377e-06, "loss": 0.73489314, "num_input_tokens_seen": 72357825, "step": 3350, "time_per_iteration": 2.7691454887390137 }, { "auxiliary_loss_clip": 0.01126392, "auxiliary_loss_mlp": 0.01043835, "balance_loss_clip": 1.0497458, "balance_loss_mlp": 1.02312887, "epoch": 0.20147301969036524, "flos": 23951412103680.0, "grad_norm": 2.3965463087123107, "language_loss": 0.76391226, "learning_rate": 3.69940833983661e-06, "loss": 0.78561449, "num_input_tokens_seen": 72376335, "step": 3351, "time_per_iteration": 2.701244592666626 }, { "auxiliary_loss_clip": 0.01134085, "auxiliary_loss_mlp": 0.01047695, "balance_loss_clip": 1.05303741, "balance_loss_mlp": 1.02840734, "epoch": 0.2015331429430332, "flos": 25588471380480.0, "grad_norm": 1.5574195085232978, "language_loss": 0.80808926, "learning_rate": 3.699202960155748e-06, "loss": 0.82990712, "num_input_tokens_seen": 72395440, "step": 3352, "time_per_iteration": 2.707792043685913 }, { "auxiliary_loss_clip": 0.011457, "auxiliary_loss_mlp": 0.01042883, "balance_loss_clip": 1.05415952, "balance_loss_mlp": 1.0244298, "epoch": 0.2015932661957012, "flos": 26724972677760.0, "grad_norm": 1.9831574274346238, "language_loss": 0.80594563, "learning_rate": 3.6989975160409396e-06, "loss": 0.82783151, "num_input_tokens_seen": 72414670, "step": 3353, "time_per_iteration": 2.675960063934326 }, { "auxiliary_loss_clip": 0.01126272, "auxiliary_loss_mlp": 0.01045978, "balance_loss_clip": 1.05195928, "balance_loss_mlp": 1.02787042, "epoch": 0.20165338944836916, "flos": 15633136592640.0, "grad_norm": 2.0684163707657763, "language_loss": 0.90046668, "learning_rate": 3.6987920074999747e-06, "loss": 0.92218912, "num_input_tokens_seen": 72432210, "step": 3354, "time_per_iteration": 2.6648361682891846 }, { "auxiliary_loss_clip": 0.0104514, "auxiliary_loss_mlp": 0.0075774, "balance_loss_clip": 1.0285337, "balance_loss_mlp": 1.00170481, "epoch": 0.20171351270103713, "flos": 57912529207680.0, "grad_norm": 0.8264169258847935, "language_loss": 0.55863291, "learning_rate": 3.6985864345406465e-06, "loss": 0.57666171, "num_input_tokens_seen": 72489225, "step": 3355, "time_per_iteration": 3.155352830886841 }, { "auxiliary_loss_clip": 0.01127799, "auxiliary_loss_mlp": 0.00776255, "balance_loss_clip": 1.05133796, "balance_loss_mlp": 1.00109434, "epoch": 0.2017736359537051, "flos": 20814363849600.0, "grad_norm": 1.8367443502770229, "language_loss": 0.84333616, "learning_rate": 3.698380797170751e-06, "loss": 0.86237669, "num_input_tokens_seen": 72508715, "step": 3356, "time_per_iteration": 2.754645586013794 }, { "auxiliary_loss_clip": 0.01127514, "auxiliary_loss_mlp": 0.01052066, "balance_loss_clip": 1.04904747, "balance_loss_mlp": 1.02811635, "epoch": 0.20183375920637306, "flos": 17092043389440.0, "grad_norm": 3.2349249330618504, "language_loss": 0.70046175, "learning_rate": 3.698175095398085e-06, "loss": 0.72225749, "num_input_tokens_seen": 72525135, "step": 3357, "time_per_iteration": 2.6905863285064697 }, { "auxiliary_loss_clip": 0.0113535, "auxiliary_loss_mlp": 0.01044956, "balance_loss_clip": 1.05209541, "balance_loss_mlp": 1.02590632, "epoch": 0.20189388245904102, "flos": 18661339658880.0, "grad_norm": 2.41944886120848, "language_loss": 0.7169627, "learning_rate": 3.6979693292304493e-06, "loss": 0.73876572, "num_input_tokens_seen": 72543690, "step": 3358, "time_per_iteration": 2.696295738220215 }, { "auxiliary_loss_clip": 0.01139673, "auxiliary_loss_mlp": 0.01052145, "balance_loss_clip": 1.05050206, "balance_loss_mlp": 1.03496706, "epoch": 0.20195400571170902, "flos": 16797539779200.0, "grad_norm": 2.6870341127491675, "language_loss": 0.83242267, "learning_rate": 3.6977634986756463e-06, "loss": 0.85434085, "num_input_tokens_seen": 72560725, "step": 3359, "time_per_iteration": 2.6779677867889404 }, { "auxiliary_loss_clip": 0.01052166, "auxiliary_loss_mlp": 0.01026452, "balance_loss_clip": 1.02534354, "balance_loss_mlp": 1.02345943, "epoch": 0.20201412896437698, "flos": 67174716268800.0, "grad_norm": 0.8259567660078829, "language_loss": 0.58980465, "learning_rate": 3.697557603741482e-06, "loss": 0.61059082, "num_input_tokens_seen": 72621940, "step": 3360, "time_per_iteration": 3.1175289154052734 }, { "auxiliary_loss_clip": 0.01096543, "auxiliary_loss_mlp": 0.01051237, "balance_loss_clip": 1.05081403, "balance_loss_mlp": 1.03154337, "epoch": 0.20207425221704495, "flos": 21325013550720.0, "grad_norm": 2.668010943284884, "language_loss": 0.63219774, "learning_rate": 3.697351644435763e-06, "loss": 0.65367556, "num_input_tokens_seen": 72639135, "step": 3361, "time_per_iteration": 2.7732017040252686 }, { "auxiliary_loss_clip": 0.01119862, "auxiliary_loss_mlp": 0.01069748, "balance_loss_clip": 1.04988885, "balance_loss_mlp": 1.05035317, "epoch": 0.2021343754697129, "flos": 22527158952960.0, "grad_norm": 1.9150118782569074, "language_loss": 0.75946522, "learning_rate": 3.6971456207662993e-06, "loss": 0.78136134, "num_input_tokens_seen": 72658525, "step": 3362, "time_per_iteration": 2.755686044692993 }, { "auxiliary_loss_clip": 0.01139499, "auxiliary_loss_mlp": 0.00777827, "balance_loss_clip": 1.05068207, "balance_loss_mlp": 1.0011797, "epoch": 0.20219449872238088, "flos": 19062785036160.0, "grad_norm": 2.043450343479612, "language_loss": 0.76542944, "learning_rate": 3.6969395327409035e-06, "loss": 0.78460264, "num_input_tokens_seen": 72678085, "step": 3363, "time_per_iteration": 2.788773775100708 }, { "auxiliary_loss_clip": 0.01143235, "auxiliary_loss_mlp": 0.01068217, "balance_loss_clip": 1.05241406, "balance_loss_mlp": 1.0511229, "epoch": 0.20225462197504884, "flos": 24717027519360.0, "grad_norm": 1.8380065969237507, "language_loss": 0.75088942, "learning_rate": 3.696733380367391e-06, "loss": 0.773004, "num_input_tokens_seen": 72698695, "step": 3364, "time_per_iteration": 2.7484803199768066 }, { "auxiliary_loss_clip": 0.01111683, "auxiliary_loss_mlp": 0.01065374, "balance_loss_clip": 1.05202723, "balance_loss_mlp": 1.04583549, "epoch": 0.2023147452277168, "flos": 22018304931840.0, "grad_norm": 2.1478979049108395, "language_loss": 0.71917796, "learning_rate": 3.6965271636535783e-06, "loss": 0.7409485, "num_input_tokens_seen": 72717880, "step": 3365, "time_per_iteration": 2.770939350128174 }, { "auxiliary_loss_clip": 0.01110149, "auxiliary_loss_mlp": 0.01064133, "balance_loss_clip": 1.04989934, "balance_loss_mlp": 1.04559648, "epoch": 0.2023748684803848, "flos": 17745365911680.0, "grad_norm": 2.2136098995040228, "language_loss": 0.85318875, "learning_rate": 3.696320882607286e-06, "loss": 0.87493157, "num_input_tokens_seen": 72736410, "step": 3366, "time_per_iteration": 2.717759609222412 }, { "auxiliary_loss_clip": 0.01116913, "auxiliary_loss_mlp": 0.0106476, "balance_loss_clip": 1.050488, "balance_loss_mlp": 1.04605615, "epoch": 0.20243499173305277, "flos": 31138932493440.0, "grad_norm": 2.048733189447585, "language_loss": 0.69766563, "learning_rate": 3.696114537236335e-06, "loss": 0.71948242, "num_input_tokens_seen": 72758295, "step": 3367, "time_per_iteration": 2.788444995880127 }, { "auxiliary_loss_clip": 0.01144949, "auxiliary_loss_mlp": 0.01060722, "balance_loss_clip": 1.04997301, "balance_loss_mlp": 1.03857303, "epoch": 0.20249511498572073, "flos": 33839235279360.0, "grad_norm": 1.942153338299175, "language_loss": 0.68162113, "learning_rate": 3.6959081275485512e-06, "loss": 0.70367789, "num_input_tokens_seen": 72782495, "step": 3368, "time_per_iteration": 2.7339746952056885 }, { "auxiliary_loss_clip": 0.01123527, "auxiliary_loss_mlp": 0.01063426, "balance_loss_clip": 1.0543493, "balance_loss_mlp": 1.04405439, "epoch": 0.2025552382383887, "flos": 21215629658880.0, "grad_norm": 1.8860162071579365, "language_loss": 0.77298439, "learning_rate": 3.6957016535517615e-06, "loss": 0.79485393, "num_input_tokens_seen": 72801885, "step": 3369, "time_per_iteration": 2.739088535308838 }, { "auxiliary_loss_clip": 0.01136965, "auxiliary_loss_mlp": 0.01071822, "balance_loss_clip": 1.05140853, "balance_loss_mlp": 1.05315351, "epoch": 0.20261536149105666, "flos": 14647388676480.0, "grad_norm": 2.9806431283259354, "language_loss": 0.65055734, "learning_rate": 3.695495115253795e-06, "loss": 0.67264521, "num_input_tokens_seen": 72816990, "step": 3370, "time_per_iteration": 2.7082977294921875 }, { "auxiliary_loss_clip": 0.0105828, "auxiliary_loss_mlp": 0.01019528, "balance_loss_clip": 1.03235602, "balance_loss_mlp": 1.01690567, "epoch": 0.20267548474372463, "flos": 66783649921920.0, "grad_norm": 0.678414814309544, "language_loss": 0.58126765, "learning_rate": 3.6952885126624834e-06, "loss": 0.60204571, "num_input_tokens_seen": 72879240, "step": 3371, "time_per_iteration": 4.805691242218018 }, { "auxiliary_loss_clip": 0.01117624, "auxiliary_loss_mlp": 0.01050757, "balance_loss_clip": 1.04833245, "balance_loss_mlp": 1.0329231, "epoch": 0.2027356079963926, "flos": 24680793674880.0, "grad_norm": 2.167047343870177, "language_loss": 0.91830015, "learning_rate": 3.6950818457856617e-06, "loss": 0.9399839, "num_input_tokens_seen": 72899030, "step": 3372, "time_per_iteration": 4.306687831878662 }, { "auxiliary_loss_clip": 0.01137734, "auxiliary_loss_mlp": 0.01057192, "balance_loss_clip": 1.05065978, "balance_loss_mlp": 1.03598428, "epoch": 0.20279573124906058, "flos": 26392762765440.0, "grad_norm": 2.1240220719821195, "language_loss": 0.78505349, "learning_rate": 3.694875114631167e-06, "loss": 0.80700278, "num_input_tokens_seen": 72919190, "step": 3373, "time_per_iteration": 4.223219394683838 }, { "auxiliary_loss_clip": 0.01091396, "auxiliary_loss_mlp": 0.01058555, "balance_loss_clip": 1.04464257, "balance_loss_mlp": 1.03719246, "epoch": 0.20285585450172855, "flos": 33799984692480.0, "grad_norm": 2.5403716567908745, "language_loss": 0.71275264, "learning_rate": 3.6946683192068377e-06, "loss": 0.7342521, "num_input_tokens_seen": 72939720, "step": 3374, "time_per_iteration": 2.853079319000244 }, { "auxiliary_loss_clip": 0.01042818, "auxiliary_loss_mlp": 0.01010518, "balance_loss_clip": 1.02580416, "balance_loss_mlp": 1.00797904, "epoch": 0.20291597775439651, "flos": 71164823598720.0, "grad_norm": 0.9711663240936556, "language_loss": 0.62466931, "learning_rate": 3.694461459520516e-06, "loss": 0.64520264, "num_input_tokens_seen": 73000015, "step": 3375, "time_per_iteration": 3.2016799449920654 }, { "auxiliary_loss_clip": 0.01153133, "auxiliary_loss_mlp": 0.01048539, "balance_loss_clip": 1.05278802, "balance_loss_mlp": 1.03021622, "epoch": 0.20297610100706448, "flos": 19494287118720.0, "grad_norm": 1.613636998778186, "language_loss": 0.82316196, "learning_rate": 3.6942545355800463e-06, "loss": 0.84517872, "num_input_tokens_seen": 73017675, "step": 3376, "time_per_iteration": 2.6073458194732666 }, { "auxiliary_loss_clip": 0.01142412, "auxiliary_loss_mlp": 0.01038523, "balance_loss_clip": 1.0506475, "balance_loss_mlp": 1.01912737, "epoch": 0.20303622425973245, "flos": 25044245441280.0, "grad_norm": 2.0454517065820026, "language_loss": 0.81243992, "learning_rate": 3.6940475473932743e-06, "loss": 0.83424926, "num_input_tokens_seen": 73036135, "step": 3377, "time_per_iteration": 2.6802914142608643 }, { "auxiliary_loss_clip": 0.01127133, "auxiliary_loss_mlp": 0.01049784, "balance_loss_clip": 1.05416846, "balance_loss_mlp": 1.03053212, "epoch": 0.2030963475124004, "flos": 21979988098560.0, "grad_norm": 1.9719049052811064, "language_loss": 0.76726258, "learning_rate": 3.69384049496805e-06, "loss": 0.78903174, "num_input_tokens_seen": 73054075, "step": 3378, "time_per_iteration": 2.7052531242370605 }, { "auxiliary_loss_clip": 0.01087342, "auxiliary_loss_mlp": 0.01049115, "balance_loss_clip": 1.04531622, "balance_loss_mlp": 1.02726364, "epoch": 0.2031564707650684, "flos": 19500392430720.0, "grad_norm": 2.0079998756584017, "language_loss": 0.7982831, "learning_rate": 3.6936333783122242e-06, "loss": 0.81964767, "num_input_tokens_seen": 73073530, "step": 3379, "time_per_iteration": 4.379331588745117 }, { "auxiliary_loss_clip": 0.01139431, "auxiliary_loss_mlp": 0.01039085, "balance_loss_clip": 1.05384874, "balance_loss_mlp": 1.02164412, "epoch": 0.20321659401773637, "flos": 22747075971840.0, "grad_norm": 1.5868581768713355, "language_loss": 0.86639273, "learning_rate": 3.6934261974336505e-06, "loss": 0.88817787, "num_input_tokens_seen": 73092820, "step": 3380, "time_per_iteration": 2.7405402660369873 }, { "auxiliary_loss_clip": 0.01156702, "auxiliary_loss_mlp": 0.01053775, "balance_loss_clip": 1.05730438, "balance_loss_mlp": 1.03507149, "epoch": 0.20327671727040433, "flos": 22455840499200.0, "grad_norm": 2.063467458189152, "language_loss": 0.74637043, "learning_rate": 3.693218952340186e-06, "loss": 0.76847517, "num_input_tokens_seen": 73113385, "step": 3381, "time_per_iteration": 2.6237549781799316 }, { "auxiliary_loss_clip": 0.01118794, "auxiliary_loss_mlp": 0.01042351, "balance_loss_clip": 1.04590273, "balance_loss_mlp": 1.02289653, "epoch": 0.2033368405230723, "flos": 19535010163200.0, "grad_norm": 1.6994666268173182, "language_loss": 0.79167414, "learning_rate": 3.6930116430396895e-06, "loss": 0.81328559, "num_input_tokens_seen": 73131195, "step": 3382, "time_per_iteration": 2.6707420349121094 }, { "auxiliary_loss_clip": 0.01113758, "auxiliary_loss_mlp": 0.00779415, "balance_loss_clip": 1.0459373, "balance_loss_mlp": 1.00091934, "epoch": 0.20339696377574026, "flos": 13809233744640.0, "grad_norm": 1.9483404178521286, "language_loss": 0.8042953, "learning_rate": 3.6928042695400214e-06, "loss": 0.82322699, "num_input_tokens_seen": 73148850, "step": 3383, "time_per_iteration": 2.7859487533569336 }, { "auxiliary_loss_clip": 0.01100731, "auxiliary_loss_mlp": 0.01046151, "balance_loss_clip": 1.04473877, "balance_loss_mlp": 1.02621913, "epoch": 0.20345708702840823, "flos": 20339409288960.0, "grad_norm": 3.0507793260875693, "language_loss": 0.74539214, "learning_rate": 3.6925968318490464e-06, "loss": 0.76686096, "num_input_tokens_seen": 73166775, "step": 3384, "time_per_iteration": 2.802645206451416 }, { "auxiliary_loss_clip": 0.0114772, "auxiliary_loss_mlp": 0.01042851, "balance_loss_clip": 1.05207324, "balance_loss_mlp": 1.02232289, "epoch": 0.2035172102810762, "flos": 20333950421760.0, "grad_norm": 7.661095363155204, "language_loss": 0.76801658, "learning_rate": 3.6923893299746293e-06, "loss": 0.7899223, "num_input_tokens_seen": 73183215, "step": 3385, "time_per_iteration": 2.823343515396118 }, { "auxiliary_loss_clip": 0.01107407, "auxiliary_loss_mlp": 0.01063941, "balance_loss_clip": 1.04730904, "balance_loss_mlp": 1.04331779, "epoch": 0.2035773335337442, "flos": 23330983461120.0, "grad_norm": 41.05937457193927, "language_loss": 0.68458641, "learning_rate": 3.692181763924639e-06, "loss": 0.70629984, "num_input_tokens_seen": 73203290, "step": 3386, "time_per_iteration": 2.830810546875 }, { "auxiliary_loss_clip": 0.01104248, "auxiliary_loss_mlp": 0.01064893, "balance_loss_clip": 1.04774165, "balance_loss_mlp": 1.04379284, "epoch": 0.20363745678641215, "flos": 28330287310080.0, "grad_norm": 3.4161658794101384, "language_loss": 0.80985248, "learning_rate": 3.691974133706947e-06, "loss": 0.83154386, "num_input_tokens_seen": 73226185, "step": 3387, "time_per_iteration": 2.8204662799835205 }, { "auxiliary_loss_clip": 0.0112504, "auxiliary_loss_mlp": 0.01049361, "balance_loss_clip": 1.05224109, "balance_loss_mlp": 1.03000104, "epoch": 0.20369758003908012, "flos": 18915658928640.0, "grad_norm": 2.703878094865874, "language_loss": 0.7988956, "learning_rate": 3.6917664393294262e-06, "loss": 0.82063961, "num_input_tokens_seen": 73243300, "step": 3388, "time_per_iteration": 2.687053918838501 }, { "auxiliary_loss_clip": 0.01157403, "auxiliary_loss_mlp": 0.01048089, "balance_loss_clip": 1.05471182, "balance_loss_mlp": 1.0281812, "epoch": 0.20375770329174808, "flos": 19206499351680.0, "grad_norm": 1.8133180655285324, "language_loss": 0.7184962, "learning_rate": 3.6915586807999527e-06, "loss": 0.74055111, "num_input_tokens_seen": 73261490, "step": 3389, "time_per_iteration": 2.614321708679199 }, { "auxiliary_loss_clip": 0.01141855, "auxiliary_loss_mlp": 0.01054311, "balance_loss_clip": 1.05387521, "balance_loss_mlp": 1.0351541, "epoch": 0.20381782654441605, "flos": 19391008538880.0, "grad_norm": 1.8982692343761227, "language_loss": 0.87280858, "learning_rate": 3.691350858126404e-06, "loss": 0.89477026, "num_input_tokens_seen": 73280180, "step": 3390, "time_per_iteration": 2.6770312786102295 }, { "auxiliary_loss_clip": 0.01125093, "auxiliary_loss_mlp": 0.01052498, "balance_loss_clip": 1.05142403, "balance_loss_mlp": 1.03129053, "epoch": 0.203877949797084, "flos": 24827704300800.0, "grad_norm": 2.3308941901233355, "language_loss": 0.71194077, "learning_rate": 3.691142971316662e-06, "loss": 0.73371667, "num_input_tokens_seen": 73300680, "step": 3391, "time_per_iteration": 2.7198221683502197 }, { "auxiliary_loss_clip": 0.01120121, "auxiliary_loss_mlp": 0.01051383, "balance_loss_clip": 1.05222178, "balance_loss_mlp": 1.0318923, "epoch": 0.20393807304975198, "flos": 18003707504640.0, "grad_norm": 2.4765720957839217, "language_loss": 0.86745828, "learning_rate": 3.6909350203786086e-06, "loss": 0.88917333, "num_input_tokens_seen": 73316760, "step": 3392, "time_per_iteration": 2.6961052417755127 }, { "auxiliary_loss_clip": 0.01145712, "auxiliary_loss_mlp": 0.01051212, "balance_loss_clip": 1.05204964, "balance_loss_mlp": 1.03236461, "epoch": 0.20399819630241997, "flos": 24206988349440.0, "grad_norm": 1.665333238668028, "language_loss": 0.80659354, "learning_rate": 3.69072700532013e-06, "loss": 0.82856286, "num_input_tokens_seen": 73339385, "step": 3393, "time_per_iteration": 2.6883490085601807 }, { "auxiliary_loss_clip": 0.01123025, "auxiliary_loss_mlp": 0.010424, "balance_loss_clip": 1.04751348, "balance_loss_mlp": 1.02385163, "epoch": 0.20405831955508794, "flos": 20777124424320.0, "grad_norm": 1.8745864895680615, "language_loss": 0.86126244, "learning_rate": 3.6905189261491137e-06, "loss": 0.88291663, "num_input_tokens_seen": 73357235, "step": 3394, "time_per_iteration": 2.758887767791748 }, { "auxiliary_loss_clip": 0.0114219, "auxiliary_loss_mlp": 0.01049288, "balance_loss_clip": 1.05699492, "balance_loss_mlp": 1.03088212, "epoch": 0.2041184428077559, "flos": 15486908325120.0, "grad_norm": 2.5133342949273416, "language_loss": 0.83761692, "learning_rate": 3.69031078287345e-06, "loss": 0.85953164, "num_input_tokens_seen": 73374435, "step": 3395, "time_per_iteration": 2.6468729972839355 }, { "auxiliary_loss_clip": 0.01145796, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.05311751, "balance_loss_mlp": 1.0200156, "epoch": 0.20417856606042387, "flos": 15588463052160.0, "grad_norm": 2.8477422591662376, "language_loss": 0.83736277, "learning_rate": 3.690102575501033e-06, "loss": 0.85921878, "num_input_tokens_seen": 73391025, "step": 3396, "time_per_iteration": 2.6296958923339844 }, { "auxiliary_loss_clip": 0.01112843, "auxiliary_loss_mlp": 0.01045334, "balance_loss_clip": 1.04787922, "balance_loss_mlp": 1.02616525, "epoch": 0.20423868931309183, "flos": 24279348297600.0, "grad_norm": 2.1192113228666303, "language_loss": 0.77199841, "learning_rate": 3.6898943040397556e-06, "loss": 0.79358017, "num_input_tokens_seen": 73409270, "step": 3397, "time_per_iteration": 2.776784896850586 }, { "auxiliary_loss_clip": 0.01128614, "auxiliary_loss_mlp": 0.01050131, "balance_loss_clip": 1.05143905, "balance_loss_mlp": 1.03264332, "epoch": 0.2042988125657598, "flos": 18614870438400.0, "grad_norm": 3.16091809956727, "language_loss": 0.8791461, "learning_rate": 3.689685968497518e-06, "loss": 0.9009335, "num_input_tokens_seen": 73425225, "step": 3398, "time_per_iteration": 2.6866374015808105 }, { "auxiliary_loss_clip": 0.01126796, "auxiliary_loss_mlp": 0.01052169, "balance_loss_clip": 1.05476117, "balance_loss_mlp": 1.03316689, "epoch": 0.2043589358184278, "flos": 17851230270720.0, "grad_norm": 2.139785862197821, "language_loss": 0.78045064, "learning_rate": 3.6894775688822186e-06, "loss": 0.80224031, "num_input_tokens_seen": 73440940, "step": 3399, "time_per_iteration": 2.6545825004577637 }, { "auxiliary_loss_clip": 0.01144155, "auxiliary_loss_mlp": 0.01042424, "balance_loss_clip": 1.05252838, "balance_loss_mlp": 1.02299261, "epoch": 0.20441905907109575, "flos": 21435223455360.0, "grad_norm": 3.6374157446104802, "language_loss": 0.76563728, "learning_rate": 3.6892691052017603e-06, "loss": 0.787503, "num_input_tokens_seen": 73458805, "step": 3400, "time_per_iteration": 2.7279481887817383 }, { "auxiliary_loss_clip": 0.01121071, "auxiliary_loss_mlp": 0.00776799, "balance_loss_clip": 1.05304742, "balance_loss_mlp": 1.00072634, "epoch": 0.20447918232376372, "flos": 27707703851520.0, "grad_norm": 1.8758513970592474, "language_loss": 0.79382575, "learning_rate": 3.6890605774640487e-06, "loss": 0.81280446, "num_input_tokens_seen": 73479380, "step": 3401, "time_per_iteration": 2.7918031215667725 }, { "auxiliary_loss_clip": 0.01131319, "auxiliary_loss_mlp": 0.01044892, "balance_loss_clip": 1.0484674, "balance_loss_mlp": 1.02540183, "epoch": 0.20453930557643168, "flos": 30524214113280.0, "grad_norm": 2.2159471948141034, "language_loss": 0.69798994, "learning_rate": 3.688851985676991e-06, "loss": 0.71975207, "num_input_tokens_seen": 73505105, "step": 3402, "time_per_iteration": 2.79670786857605 }, { "auxiliary_loss_clip": 0.01120554, "auxiliary_loss_mlp": 0.01043946, "balance_loss_clip": 1.05060196, "balance_loss_mlp": 1.02439535, "epoch": 0.20459942882909965, "flos": 18987767481600.0, "grad_norm": 1.7908768446457861, "language_loss": 0.81114817, "learning_rate": 3.688643329848496e-06, "loss": 0.83279312, "num_input_tokens_seen": 73523700, "step": 3403, "time_per_iteration": 2.70182728767395 }, { "auxiliary_loss_clip": 0.01144248, "auxiliary_loss_mlp": 0.01041199, "balance_loss_clip": 1.05348516, "balance_loss_mlp": 1.02295971, "epoch": 0.20465955208176762, "flos": 20339050152960.0, "grad_norm": 2.511955552730785, "language_loss": 0.83403814, "learning_rate": 3.6884346099864772e-06, "loss": 0.8558926, "num_input_tokens_seen": 73542625, "step": 3404, "time_per_iteration": 2.630807399749756 }, { "auxiliary_loss_clip": 0.01138937, "auxiliary_loss_mlp": 0.01048101, "balance_loss_clip": 1.04838705, "balance_loss_mlp": 1.0292058, "epoch": 0.20471967533443558, "flos": 21251288885760.0, "grad_norm": 1.7149716538767368, "language_loss": 0.86209136, "learning_rate": 3.6882258260988487e-06, "loss": 0.88396174, "num_input_tokens_seen": 73561450, "step": 3405, "time_per_iteration": 2.6076929569244385 }, { "auxiliary_loss_clip": 0.01116224, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.05039132, "balance_loss_mlp": 1.02621806, "epoch": 0.20477979858710357, "flos": 14501555458560.0, "grad_norm": 2.1633598971137435, "language_loss": 0.84356105, "learning_rate": 3.6880169781935276e-06, "loss": 0.86516619, "num_input_tokens_seen": 73577155, "step": 3406, "time_per_iteration": 2.768890142440796 }, { "auxiliary_loss_clip": 0.01152751, "auxiliary_loss_mlp": 0.01039548, "balance_loss_clip": 1.0542599, "balance_loss_mlp": 1.02191663, "epoch": 0.20483992183977154, "flos": 11400310085760.0, "grad_norm": 2.4892039461455675, "language_loss": 0.67453218, "learning_rate": 3.6878080662784336e-06, "loss": 0.69645512, "num_input_tokens_seen": 73594900, "step": 3407, "time_per_iteration": 2.5661377906799316 }, { "auxiliary_loss_clip": 0.0115175, "auxiliary_loss_mlp": 0.01050505, "balance_loss_clip": 1.05328465, "balance_loss_mlp": 1.03294516, "epoch": 0.2049000450924395, "flos": 19060271084160.0, "grad_norm": 2.4363182538361285, "language_loss": 0.84214294, "learning_rate": 3.6875990903614886e-06, "loss": 0.86416554, "num_input_tokens_seen": 73613810, "step": 3408, "time_per_iteration": 2.585186004638672 }, { "auxiliary_loss_clip": 0.01154901, "auxiliary_loss_mlp": 0.01042295, "balance_loss_clip": 1.0536257, "balance_loss_mlp": 1.02471161, "epoch": 0.20496016834510747, "flos": 14574561851520.0, "grad_norm": 2.317815935455145, "language_loss": 0.63898516, "learning_rate": 3.6873900504506166e-06, "loss": 0.6609571, "num_input_tokens_seen": 73631495, "step": 3409, "time_per_iteration": 2.5877959728240967 }, { "auxiliary_loss_clip": 0.0113795, "auxiliary_loss_mlp": 0.01042481, "balance_loss_clip": 1.04903567, "balance_loss_mlp": 1.02409852, "epoch": 0.20502029159777543, "flos": 22126647329280.0, "grad_norm": 1.3925959707869588, "language_loss": 0.80547982, "learning_rate": 3.687180946553745e-06, "loss": 0.8272841, "num_input_tokens_seen": 73652840, "step": 3410, "time_per_iteration": 4.1697752475738525 }, { "auxiliary_loss_clip": 0.01099823, "auxiliary_loss_mlp": 0.01046015, "balance_loss_clip": 1.05186486, "balance_loss_mlp": 1.02820492, "epoch": 0.2050804148504434, "flos": 25367907916800.0, "grad_norm": 2.407452066099965, "language_loss": 0.75804615, "learning_rate": 3.686971778678803e-06, "loss": 0.77950454, "num_input_tokens_seen": 73672150, "step": 3411, "time_per_iteration": 2.8072102069854736 }, { "auxiliary_loss_clip": 0.0113879, "auxiliary_loss_mlp": 0.01046868, "balance_loss_clip": 1.05501246, "balance_loss_mlp": 1.02887905, "epoch": 0.2051405381031114, "flos": 23620171858560.0, "grad_norm": 2.4936494073109445, "language_loss": 0.73356283, "learning_rate": 3.686762546833722e-06, "loss": 0.75541937, "num_input_tokens_seen": 73691940, "step": 3412, "time_per_iteration": 5.778446912765503 }, { "auxiliary_loss_clip": 0.01127692, "auxiliary_loss_mlp": 0.01057937, "balance_loss_clip": 1.04926813, "balance_loss_mlp": 1.03748107, "epoch": 0.20520066135577936, "flos": 19565533745280.0, "grad_norm": 2.3541654180764353, "language_loss": 0.77958596, "learning_rate": 3.6865532510264362e-06, "loss": 0.80144227, "num_input_tokens_seen": 73709080, "step": 3413, "time_per_iteration": 2.6457245349884033 }, { "auxiliary_loss_clip": 0.0110869, "auxiliary_loss_mlp": 0.01047866, "balance_loss_clip": 1.04991519, "balance_loss_mlp": 1.02862608, "epoch": 0.20526078460844732, "flos": 17676345928320.0, "grad_norm": 2.4834314093653673, "language_loss": 0.85112405, "learning_rate": 3.6863438912648823e-06, "loss": 0.8726896, "num_input_tokens_seen": 73727670, "step": 3414, "time_per_iteration": 2.7343668937683105 }, { "auxiliary_loss_clip": 0.01140219, "auxiliary_loss_mlp": 0.01039448, "balance_loss_clip": 1.05012155, "balance_loss_mlp": 1.02118468, "epoch": 0.2053209078611153, "flos": 21500328856320.0, "grad_norm": 2.0410772094937433, "language_loss": 0.80372798, "learning_rate": 3.6861344675569986e-06, "loss": 0.82552463, "num_input_tokens_seen": 73747170, "step": 3415, "time_per_iteration": 2.6669082641601562 }, { "auxiliary_loss_clip": 0.01087022, "auxiliary_loss_mlp": 0.01042771, "balance_loss_clip": 1.04786301, "balance_loss_mlp": 1.02643943, "epoch": 0.20538103111378325, "flos": 25663524848640.0, "grad_norm": 1.941742032659622, "language_loss": 0.72958827, "learning_rate": 3.6859249799107275e-06, "loss": 0.75088626, "num_input_tokens_seen": 73767690, "step": 3416, "time_per_iteration": 2.892782211303711 }, { "auxiliary_loss_clip": 0.01145149, "auxiliary_loss_mlp": 0.01044328, "balance_loss_clip": 1.05453372, "balance_loss_mlp": 1.02577877, "epoch": 0.20544115436645122, "flos": 23148952312320.0, "grad_norm": 2.508583707985938, "language_loss": 0.78741407, "learning_rate": 3.6857154283340115e-06, "loss": 0.80930889, "num_input_tokens_seen": 73786900, "step": 3417, "time_per_iteration": 2.7298929691314697 }, { "auxiliary_loss_clip": 0.01145459, "auxiliary_loss_mlp": 0.0104683, "balance_loss_clip": 1.0536468, "balance_loss_mlp": 1.02819777, "epoch": 0.20550127761911918, "flos": 19390433921280.0, "grad_norm": 2.4305498920504043, "language_loss": 0.8729043, "learning_rate": 3.685505812834798e-06, "loss": 0.89482725, "num_input_tokens_seen": 73804515, "step": 3418, "time_per_iteration": 4.382033109664917 }, { "auxiliary_loss_clip": 0.01140182, "auxiliary_loss_mlp": 0.01046543, "balance_loss_clip": 1.05682349, "balance_loss_mlp": 1.02776778, "epoch": 0.20556140087178718, "flos": 22893124671360.0, "grad_norm": 14.690715253896212, "language_loss": 0.62538671, "learning_rate": 3.685296133421035e-06, "loss": 0.64725399, "num_input_tokens_seen": 73822910, "step": 3419, "time_per_iteration": 2.7318668365478516 }, { "auxiliary_loss_clip": 0.01139691, "auxiliary_loss_mlp": 0.01046928, "balance_loss_clip": 1.05550981, "balance_loss_mlp": 1.02651954, "epoch": 0.20562152412445514, "flos": 19789652655360.0, "grad_norm": 1.8153871521224594, "language_loss": 0.86339438, "learning_rate": 3.685086390100674e-06, "loss": 0.88526058, "num_input_tokens_seen": 73841160, "step": 3420, "time_per_iteration": 2.723606824874878 }, { "auxiliary_loss_clip": 0.01104401, "auxiliary_loss_mlp": 0.00780617, "balance_loss_clip": 1.04621911, "balance_loss_mlp": 1.00071514, "epoch": 0.2056816473771231, "flos": 31501989210240.0, "grad_norm": 2.3982854973621954, "language_loss": 0.7127136, "learning_rate": 3.684876582881668e-06, "loss": 0.73156381, "num_input_tokens_seen": 73862795, "step": 3421, "time_per_iteration": 2.8138315677642822 }, { "auxiliary_loss_clip": 0.01153254, "auxiliary_loss_mlp": 0.01039984, "balance_loss_clip": 1.05382609, "balance_loss_mlp": 1.02160168, "epoch": 0.20574177062979107, "flos": 23258372117760.0, "grad_norm": 6.231519820465981, "language_loss": 0.70559299, "learning_rate": 3.6846667117719732e-06, "loss": 0.72752541, "num_input_tokens_seen": 73881525, "step": 3422, "time_per_iteration": 2.6411848068237305 }, { "auxiliary_loss_clip": 0.01062123, "auxiliary_loss_mlp": 0.01005097, "balance_loss_clip": 1.03459418, "balance_loss_mlp": 1.00220013, "epoch": 0.20580189388245904, "flos": 70312518708480.0, "grad_norm": 0.740118932422812, "language_loss": 0.55461621, "learning_rate": 3.684456776779548e-06, "loss": 0.57528841, "num_input_tokens_seen": 73937775, "step": 3423, "time_per_iteration": 3.259685516357422 }, { "auxiliary_loss_clip": 0.01104389, "auxiliary_loss_mlp": 0.01039296, "balance_loss_clip": 1.04975653, "balance_loss_mlp": 1.02089024, "epoch": 0.205862017135127, "flos": 30737846252160.0, "grad_norm": 1.9242047681435088, "language_loss": 0.71910381, "learning_rate": 3.684246777912353e-06, "loss": 0.74054068, "num_input_tokens_seen": 73958250, "step": 3424, "time_per_iteration": 2.800283432006836 }, { "auxiliary_loss_clip": 0.01125916, "auxiliary_loss_mlp": 0.00777945, "balance_loss_clip": 1.05704927, "balance_loss_mlp": 1.00086677, "epoch": 0.20592214038779497, "flos": 21324546673920.0, "grad_norm": 1.6235965502825092, "language_loss": 0.74980927, "learning_rate": 3.684036715178351e-06, "loss": 0.76884782, "num_input_tokens_seen": 73977775, "step": 3425, "time_per_iteration": 2.751030206680298 }, { "auxiliary_loss_clip": 0.01104665, "auxiliary_loss_mlp": 0.01058685, "balance_loss_clip": 1.05047321, "balance_loss_mlp": 1.03983784, "epoch": 0.20598226364046296, "flos": 22891652213760.0, "grad_norm": 1.7765616723027935, "language_loss": 0.87936616, "learning_rate": 3.683826588585508e-06, "loss": 0.90099961, "num_input_tokens_seen": 73996590, "step": 3426, "time_per_iteration": 2.8539180755615234 }, { "auxiliary_loss_clip": 0.01144422, "auxiliary_loss_mlp": 0.01045493, "balance_loss_clip": 1.05773449, "balance_loss_mlp": 1.0281601, "epoch": 0.20604238689313092, "flos": 23878549365120.0, "grad_norm": 1.836530467647624, "language_loss": 0.76435733, "learning_rate": 3.6836163981417926e-06, "loss": 0.78625643, "num_input_tokens_seen": 74015935, "step": 3427, "time_per_iteration": 2.7024967670440674 }, { "auxiliary_loss_clip": 0.01159387, "auxiliary_loss_mlp": 0.01050023, "balance_loss_clip": 1.0577209, "balance_loss_mlp": 1.03185558, "epoch": 0.2061025101457989, "flos": 22491535639680.0, "grad_norm": 2.7350574840199964, "language_loss": 0.74176943, "learning_rate": 3.683406143855174e-06, "loss": 0.76386356, "num_input_tokens_seen": 74036575, "step": 3428, "time_per_iteration": 2.593151569366455 }, { "auxiliary_loss_clip": 0.01132797, "auxiliary_loss_mlp": 0.01046534, "balance_loss_clip": 1.05232322, "balance_loss_mlp": 1.0274843, "epoch": 0.20616263339846685, "flos": 22778928357120.0, "grad_norm": 3.829070534376961, "language_loss": 0.73316109, "learning_rate": 3.6831958257336256e-06, "loss": 0.75495446, "num_input_tokens_seen": 74055365, "step": 3429, "time_per_iteration": 2.7357261180877686 }, { "auxiliary_loss_clip": 0.01144108, "auxiliary_loss_mlp": 0.01049081, "balance_loss_clip": 1.05838966, "balance_loss_mlp": 1.03030515, "epoch": 0.20622275665113482, "flos": 20882198684160.0, "grad_norm": 2.201354934958512, "language_loss": 0.85586745, "learning_rate": 3.6829854437851237e-06, "loss": 0.87779927, "num_input_tokens_seen": 74074875, "step": 3430, "time_per_iteration": 2.658486843109131 }, { "auxiliary_loss_clip": 0.01088509, "auxiliary_loss_mlp": 0.01053254, "balance_loss_clip": 1.04814601, "balance_loss_mlp": 1.03387105, "epoch": 0.20628287990380278, "flos": 19354415558400.0, "grad_norm": 1.8292569880077065, "language_loss": 0.68859613, "learning_rate": 3.6827749980176444e-06, "loss": 0.71001375, "num_input_tokens_seen": 74094505, "step": 3431, "time_per_iteration": 2.811061143875122 }, { "auxiliary_loss_clip": 0.01027012, "auxiliary_loss_mlp": 0.01012446, "balance_loss_clip": 1.03099978, "balance_loss_mlp": 1.00976419, "epoch": 0.20634300315647078, "flos": 71517932248320.0, "grad_norm": 0.8066063325789609, "language_loss": 0.60172188, "learning_rate": 3.6825644884391693e-06, "loss": 0.62211645, "num_input_tokens_seen": 74158500, "step": 3432, "time_per_iteration": 3.415828227996826 }, { "auxiliary_loss_clip": 0.01146488, "auxiliary_loss_mlp": 0.01044703, "balance_loss_clip": 1.0583806, "balance_loss_mlp": 1.02669072, "epoch": 0.20640312640913874, "flos": 21723944976000.0, "grad_norm": 2.5535613418278116, "language_loss": 0.72622889, "learning_rate": 3.682353915057679e-06, "loss": 0.74814081, "num_input_tokens_seen": 74176685, "step": 3433, "time_per_iteration": 2.715195655822754 }, { "auxiliary_loss_clip": 0.0109694, "auxiliary_loss_mlp": 0.01050867, "balance_loss_clip": 1.04781306, "balance_loss_mlp": 1.03019655, "epoch": 0.2064632496618067, "flos": 20554621626240.0, "grad_norm": 2.096486283687917, "language_loss": 0.87233114, "learning_rate": 3.6821432778811604e-06, "loss": 0.8938092, "num_input_tokens_seen": 74194935, "step": 3434, "time_per_iteration": 2.7781460285186768 }, { "auxiliary_loss_clip": 0.01151381, "auxiliary_loss_mlp": 0.01045497, "balance_loss_clip": 1.05561388, "balance_loss_mlp": 1.02719867, "epoch": 0.20652337291447467, "flos": 29823273135360.0, "grad_norm": 1.7621185839090663, "language_loss": 0.69533503, "learning_rate": 3.6819325769176004e-06, "loss": 0.71730381, "num_input_tokens_seen": 74215400, "step": 3435, "time_per_iteration": 2.7425992488861084 }, { "auxiliary_loss_clip": 0.01127853, "auxiliary_loss_mlp": 0.01045604, "balance_loss_clip": 1.05583, "balance_loss_mlp": 1.02672172, "epoch": 0.20658349616714264, "flos": 26213640618240.0, "grad_norm": 30.077934868422773, "language_loss": 0.89116997, "learning_rate": 3.681721812174988e-06, "loss": 0.91290456, "num_input_tokens_seen": 74234090, "step": 3436, "time_per_iteration": 2.7460577487945557 }, { "auxiliary_loss_clip": 0.01118033, "auxiliary_loss_mlp": 0.01041557, "balance_loss_clip": 1.05178559, "balance_loss_mlp": 1.02168477, "epoch": 0.2066436194198106, "flos": 25994370044160.0, "grad_norm": 1.7370712778981523, "language_loss": 0.77330887, "learning_rate": 3.6815109836613163e-06, "loss": 0.79490477, "num_input_tokens_seen": 74253345, "step": 3437, "time_per_iteration": 2.7507588863372803 }, { "auxiliary_loss_clip": 0.01144607, "auxiliary_loss_mlp": 0.01040376, "balance_loss_clip": 1.05298507, "balance_loss_mlp": 1.02323389, "epoch": 0.20670374267247857, "flos": 21361067827200.0, "grad_norm": 1.8326742989814773, "language_loss": 0.77813125, "learning_rate": 3.6813000913845795e-06, "loss": 0.799981, "num_input_tokens_seen": 74271615, "step": 3438, "time_per_iteration": 2.7624385356903076 }, { "auxiliary_loss_clip": 0.01063811, "auxiliary_loss_mlp": 0.01002308, "balance_loss_clip": 1.03603387, "balance_loss_mlp": 0.9995541, "epoch": 0.20676386592514656, "flos": 66383281952640.0, "grad_norm": 0.8298524953876073, "language_loss": 0.67093015, "learning_rate": 3.6810891353527747e-06, "loss": 0.69159138, "num_input_tokens_seen": 74331390, "step": 3439, "time_per_iteration": 3.2026216983795166 }, { "auxiliary_loss_clip": 0.01148913, "auxiliary_loss_mlp": 0.01041213, "balance_loss_clip": 1.05590546, "balance_loss_mlp": 1.02299786, "epoch": 0.20682398917781453, "flos": 17274577328640.0, "grad_norm": 1.9537104709510729, "language_loss": 0.83907467, "learning_rate": 3.6808781155739014e-06, "loss": 0.86097592, "num_input_tokens_seen": 74347335, "step": 3440, "time_per_iteration": 2.6949758529663086 }, { "auxiliary_loss_clip": 0.01147739, "auxiliary_loss_mlp": 0.01041939, "balance_loss_clip": 1.05509627, "balance_loss_mlp": 1.02458239, "epoch": 0.2068841124304825, "flos": 18077288515200.0, "grad_norm": 1.8008884636634683, "language_loss": 0.84828413, "learning_rate": 3.6806670320559614e-06, "loss": 0.8701809, "num_input_tokens_seen": 74366310, "step": 3441, "time_per_iteration": 2.6440463066101074 }, { "auxiliary_loss_clip": 0.01110175, "auxiliary_loss_mlp": 0.01048552, "balance_loss_clip": 1.05599904, "balance_loss_mlp": 1.03050399, "epoch": 0.20694423568315046, "flos": 27347017432320.0, "grad_norm": 1.7415147413468661, "language_loss": 0.85854685, "learning_rate": 3.680455884806959e-06, "loss": 0.88013411, "num_input_tokens_seen": 74387100, "step": 3442, "time_per_iteration": 2.8222689628601074 }, { "auxiliary_loss_clip": 0.01078025, "auxiliary_loss_mlp": 0.01050799, "balance_loss_clip": 1.05186844, "balance_loss_mlp": 1.03095019, "epoch": 0.20700435893581842, "flos": 20229845829120.0, "grad_norm": 1.9775081815037283, "language_loss": 0.73038852, "learning_rate": 3.6802446738349014e-06, "loss": 0.75167674, "num_input_tokens_seen": 74404460, "step": 3443, "time_per_iteration": 2.8044140338897705 }, { "auxiliary_loss_clip": 0.01127625, "auxiliary_loss_mlp": 0.00776303, "balance_loss_clip": 1.05408895, "balance_loss_mlp": 1.00079513, "epoch": 0.2070644821884864, "flos": 20631111638400.0, "grad_norm": 1.84636320729986, "language_loss": 0.85586846, "learning_rate": 3.680033399147797e-06, "loss": 0.87490773, "num_input_tokens_seen": 74423790, "step": 3444, "time_per_iteration": 2.7582647800445557 }, { "auxiliary_loss_clip": 0.01036759, "auxiliary_loss_mlp": 0.01007145, "balance_loss_clip": 1.03905272, "balance_loss_mlp": 1.0042963, "epoch": 0.20712460544115438, "flos": 65941077617280.0, "grad_norm": 0.6999396122177431, "language_loss": 0.57092249, "learning_rate": 3.6798220607536585e-06, "loss": 0.59136152, "num_input_tokens_seen": 74488130, "step": 3445, "time_per_iteration": 3.249602794647217 }, { "auxiliary_loss_clip": 0.01152738, "auxiliary_loss_mlp": 0.00776634, "balance_loss_clip": 1.0538106, "balance_loss_mlp": 1.00088191, "epoch": 0.20718472869382235, "flos": 19425734012160.0, "grad_norm": 1.6453630130444594, "language_loss": 0.78469276, "learning_rate": 3.6796106586604987e-06, "loss": 0.80398649, "num_input_tokens_seen": 74506720, "step": 3446, "time_per_iteration": 2.6341898441314697 }, { "auxiliary_loss_clip": 0.01151445, "auxiliary_loss_mlp": 0.01043774, "balance_loss_clip": 1.05439711, "balance_loss_mlp": 1.02297151, "epoch": 0.2072448519464903, "flos": 24499049834880.0, "grad_norm": 2.013256457797304, "language_loss": 0.63031304, "learning_rate": 3.679399192876334e-06, "loss": 0.65226525, "num_input_tokens_seen": 74525330, "step": 3447, "time_per_iteration": 2.6912922859191895 }, { "auxiliary_loss_clip": 0.01103828, "auxiliary_loss_mlp": 0.01058453, "balance_loss_clip": 1.04668319, "balance_loss_mlp": 1.03828287, "epoch": 0.20730497519915828, "flos": 23075694524160.0, "grad_norm": 1.7423220349735584, "language_loss": 0.86291325, "learning_rate": 3.679187663409184e-06, "loss": 0.88453603, "num_input_tokens_seen": 74544535, "step": 3448, "time_per_iteration": 2.787576675415039 }, { "auxiliary_loss_clip": 0.01128629, "auxiliary_loss_mlp": 0.01045151, "balance_loss_clip": 1.049932, "balance_loss_mlp": 1.02556467, "epoch": 0.20736509845182624, "flos": 21069042255360.0, "grad_norm": 3.8253504349982044, "language_loss": 0.75264204, "learning_rate": 3.6789760702670696e-06, "loss": 0.77437979, "num_input_tokens_seen": 74562300, "step": 3449, "time_per_iteration": 4.354467391967773 }, { "auxiliary_loss_clip": 0.01141162, "auxiliary_loss_mlp": 0.01050212, "balance_loss_clip": 1.0534308, "balance_loss_mlp": 1.03073323, "epoch": 0.2074252217044942, "flos": 17633288499840.0, "grad_norm": 2.156163289660715, "language_loss": 0.76558924, "learning_rate": 3.6787644134580134e-06, "loss": 0.787503, "num_input_tokens_seen": 74580080, "step": 3450, "time_per_iteration": 2.7020533084869385 }, { "auxiliary_loss_clip": 0.01128554, "auxiliary_loss_mlp": 0.01044182, "balance_loss_clip": 1.05234683, "balance_loss_mlp": 1.02522802, "epoch": 0.20748534495716217, "flos": 23546985897600.0, "grad_norm": 1.6446708221415856, "language_loss": 0.82074821, "learning_rate": 3.6785526929900436e-06, "loss": 0.84247565, "num_input_tokens_seen": 74598980, "step": 3451, "time_per_iteration": 2.7753186225891113 }, { "auxiliary_loss_clip": 0.01064426, "auxiliary_loss_mlp": 0.01003577, "balance_loss_clip": 1.02722275, "balance_loss_mlp": 1.00099015, "epoch": 0.20754546820983016, "flos": 52252935598080.0, "grad_norm": 0.793594031040259, "language_loss": 0.56562752, "learning_rate": 3.6783409088711875e-06, "loss": 0.58630753, "num_input_tokens_seen": 74655275, "step": 3452, "time_per_iteration": 6.257205963134766 }, { "auxiliary_loss_clip": 0.01124123, "auxiliary_loss_mlp": 0.00776806, "balance_loss_clip": 1.05206704, "balance_loss_mlp": 1.0008918, "epoch": 0.20760559146249813, "flos": 20412379768320.0, "grad_norm": 2.245823129763223, "language_loss": 0.88341558, "learning_rate": 3.6781290611094755e-06, "loss": 0.90242493, "num_input_tokens_seen": 74674560, "step": 3453, "time_per_iteration": 2.7009050846099854 }, { "auxiliary_loss_clip": 0.01146287, "auxiliary_loss_mlp": 0.01044217, "balance_loss_clip": 1.05471313, "balance_loss_mlp": 1.02521539, "epoch": 0.2076657147151661, "flos": 23186012169600.0, "grad_norm": 2.2325669459725574, "language_loss": 0.79920429, "learning_rate": 3.6779171497129407e-06, "loss": 0.82110935, "num_input_tokens_seen": 74694500, "step": 3454, "time_per_iteration": 2.7080893516540527 }, { "auxiliary_loss_clip": 0.01104984, "auxiliary_loss_mlp": 0.00777717, "balance_loss_clip": 1.04356718, "balance_loss_mlp": 1.0007751, "epoch": 0.20772583796783406, "flos": 18293219124480.0, "grad_norm": 3.601668384502942, "language_loss": 0.76601356, "learning_rate": 3.6777051746896202e-06, "loss": 0.78484058, "num_input_tokens_seen": 74710485, "step": 3455, "time_per_iteration": 2.6733248233795166 }, { "auxiliary_loss_clip": 0.01115407, "auxiliary_loss_mlp": 0.01050321, "balance_loss_clip": 1.04759336, "balance_loss_mlp": 1.0326066, "epoch": 0.20778596122050202, "flos": 17602800831360.0, "grad_norm": 1.908671081537558, "language_loss": 0.80200219, "learning_rate": 3.6774931360475516e-06, "loss": 0.82365942, "num_input_tokens_seen": 74727450, "step": 3456, "time_per_iteration": 2.6950278282165527 }, { "auxiliary_loss_clip": 0.01112832, "auxiliary_loss_mlp": 0.00777675, "balance_loss_clip": 1.05166578, "balance_loss_mlp": 1.00099969, "epoch": 0.20784608447317, "flos": 23805578885760.0, "grad_norm": 2.135320694722552, "language_loss": 0.78070557, "learning_rate": 3.6772810337947745e-06, "loss": 0.79961067, "num_input_tokens_seen": 74746725, "step": 3457, "time_per_iteration": 4.381137132644653 }, { "auxiliary_loss_clip": 0.01082177, "auxiliary_loss_mlp": 0.01058291, "balance_loss_clip": 1.04310393, "balance_loss_mlp": 1.03651094, "epoch": 0.20790620772583795, "flos": 17639286071040.0, "grad_norm": 1.7652855773158553, "language_loss": 0.8360287, "learning_rate": 3.677068867939333e-06, "loss": 0.85743344, "num_input_tokens_seen": 74765255, "step": 3458, "time_per_iteration": 2.7332653999328613 }, { "auxiliary_loss_clip": 0.01140275, "auxiliary_loss_mlp": 0.0077698, "balance_loss_clip": 1.05156302, "balance_loss_mlp": 1.00095606, "epoch": 0.20796633097850595, "flos": 27673481168640.0, "grad_norm": 11.883071119862361, "language_loss": 0.75769317, "learning_rate": 3.676856638489272e-06, "loss": 0.77686572, "num_input_tokens_seen": 74785710, "step": 3459, "time_per_iteration": 2.705026626586914 }, { "auxiliary_loss_clip": 0.01089168, "auxiliary_loss_mlp": 0.01038825, "balance_loss_clip": 1.04769015, "balance_loss_mlp": 1.02081251, "epoch": 0.2080264542311739, "flos": 19245606284160.0, "grad_norm": 2.1071303009051428, "language_loss": 0.77105331, "learning_rate": 3.6766443454526382e-06, "loss": 0.79233319, "num_input_tokens_seen": 74804490, "step": 3460, "time_per_iteration": 2.749965190887451 }, { "auxiliary_loss_clip": 0.0109477, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.04938984, "balance_loss_mlp": 1.02838707, "epoch": 0.20808657748384188, "flos": 27525924097920.0, "grad_norm": 9.5480036120023, "language_loss": 0.75802225, "learning_rate": 3.6764319888374836e-06, "loss": 0.77944589, "num_input_tokens_seen": 74826340, "step": 3461, "time_per_iteration": 2.7929086685180664 }, { "auxiliary_loss_clip": 0.01124748, "auxiliary_loss_mlp": 0.01041543, "balance_loss_clip": 1.04610133, "balance_loss_mlp": 1.02203989, "epoch": 0.20814670073650984, "flos": 26906931999360.0, "grad_norm": 2.001927586001653, "language_loss": 0.8848443, "learning_rate": 3.6762195686518604e-06, "loss": 0.90650725, "num_input_tokens_seen": 74844960, "step": 3462, "time_per_iteration": 2.7031619548797607 }, { "auxiliary_loss_clip": 0.01023861, "auxiliary_loss_mlp": 0.00757905, "balance_loss_clip": 1.02540636, "balance_loss_mlp": 1.00168896, "epoch": 0.2082068239891778, "flos": 70175735717760.0, "grad_norm": 0.7622558664505636, "language_loss": 0.59010452, "learning_rate": 3.6760070849038226e-06, "loss": 0.6079222, "num_input_tokens_seen": 74909075, "step": 3463, "time_per_iteration": 3.4111485481262207 }, { "auxiliary_loss_clip": 0.01132553, "auxiliary_loss_mlp": 0.01047591, "balance_loss_clip": 1.04893148, "balance_loss_mlp": 1.02866018, "epoch": 0.20826694724184577, "flos": 24608074590720.0, "grad_norm": 2.6002828602708283, "language_loss": 0.66744608, "learning_rate": 3.675794537601429e-06, "loss": 0.68924749, "num_input_tokens_seen": 74928125, "step": 3464, "time_per_iteration": 2.718229293823242 }, { "auxiliary_loss_clip": 0.0112374, "auxiliary_loss_mlp": 0.0104712, "balance_loss_clip": 1.05101657, "balance_loss_mlp": 1.02755797, "epoch": 0.20832707049451377, "flos": 12892829034240.0, "grad_norm": 2.9384916482598205, "language_loss": 0.84044278, "learning_rate": 3.6755819267527373e-06, "loss": 0.86215138, "num_input_tokens_seen": 74945090, "step": 3465, "time_per_iteration": 2.732109546661377 }, { "auxiliary_loss_clip": 0.01096712, "auxiliary_loss_mlp": 0.01040605, "balance_loss_clip": 1.04373813, "balance_loss_mlp": 1.02221096, "epoch": 0.20838719374718173, "flos": 22198827709440.0, "grad_norm": 2.576139197384499, "language_loss": 0.81923312, "learning_rate": 3.6753692523658113e-06, "loss": 0.84060633, "num_input_tokens_seen": 74963630, "step": 3466, "time_per_iteration": 2.7758567333221436 }, { "auxiliary_loss_clip": 0.01140158, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.05322194, "balance_loss_mlp": 1.02787983, "epoch": 0.2084473169998497, "flos": 15158648908800.0, "grad_norm": 4.780862188541671, "language_loss": 0.82008922, "learning_rate": 3.675156514448716e-06, "loss": 0.84193271, "num_input_tokens_seen": 74981875, "step": 3467, "time_per_iteration": 2.5788159370422363 }, { "auxiliary_loss_clip": 0.01149826, "auxiliary_loss_mlp": 0.01040027, "balance_loss_clip": 1.05362797, "balance_loss_mlp": 1.02265835, "epoch": 0.20850744025251766, "flos": 17456788045440.0, "grad_norm": 2.009157691583003, "language_loss": 0.82178962, "learning_rate": 3.674943713009518e-06, "loss": 0.84368813, "num_input_tokens_seen": 74999155, "step": 3468, "time_per_iteration": 2.5874218940734863 }, { "auxiliary_loss_clip": 0.01143942, "auxiliary_loss_mlp": 0.01048537, "balance_loss_clip": 1.05300629, "balance_loss_mlp": 1.02774715, "epoch": 0.20856756350518563, "flos": 25698968593920.0, "grad_norm": 2.0793964386868584, "language_loss": 0.90328556, "learning_rate": 3.6747308480562856e-06, "loss": 0.92521036, "num_input_tokens_seen": 75017850, "step": 3469, "time_per_iteration": 2.6595447063446045 }, { "auxiliary_loss_clip": 0.01125181, "auxiliary_loss_mlp": 0.0104984, "balance_loss_clip": 1.05548537, "balance_loss_mlp": 1.03175592, "epoch": 0.2086276867578536, "flos": 37889060970240.0, "grad_norm": 1.9058635967771913, "language_loss": 0.76809812, "learning_rate": 3.674517919597092e-06, "loss": 0.78984833, "num_input_tokens_seen": 75039270, "step": 3470, "time_per_iteration": 2.908046245574951 }, { "auxiliary_loss_clip": 0.01133446, "auxiliary_loss_mlp": 0.01047618, "balance_loss_clip": 1.0551517, "balance_loss_mlp": 1.02942634, "epoch": 0.20868781001052156, "flos": 25557049958400.0, "grad_norm": 2.301093296435647, "language_loss": 0.75801277, "learning_rate": 3.674304927640011e-06, "loss": 0.77982342, "num_input_tokens_seen": 75059350, "step": 3471, "time_per_iteration": 2.713533401489258 }, { "auxiliary_loss_clip": 0.01123818, "auxiliary_loss_mlp": 0.01053513, "balance_loss_clip": 1.04961812, "balance_loss_mlp": 1.03384328, "epoch": 0.20874793326318955, "flos": 27529192235520.0, "grad_norm": 2.366290140730035, "language_loss": 0.75703716, "learning_rate": 3.67409187219312e-06, "loss": 0.77881044, "num_input_tokens_seen": 75080150, "step": 3472, "time_per_iteration": 2.785034656524658 }, { "auxiliary_loss_clip": 0.01140589, "auxiliary_loss_mlp": 0.01046494, "balance_loss_clip": 1.05084538, "balance_loss_mlp": 1.02854145, "epoch": 0.20880805651585752, "flos": 18548795370240.0, "grad_norm": 7.277377921302429, "language_loss": 0.84276807, "learning_rate": 3.6738787532644966e-06, "loss": 0.86463886, "num_input_tokens_seen": 75097920, "step": 3473, "time_per_iteration": 2.6236281394958496 }, { "auxiliary_loss_clip": 0.01057043, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.05363917, "balance_loss_mlp": 1.03434241, "epoch": 0.20886817976852548, "flos": 65946644225280.0, "grad_norm": 0.9045809123115837, "language_loss": 0.63652557, "learning_rate": 3.6736655708622235e-06, "loss": 0.65747303, "num_input_tokens_seen": 75152410, "step": 3474, "time_per_iteration": 3.1946537494659424 }, { "auxiliary_loss_clip": 0.0113535, "auxiliary_loss_mlp": 0.01045984, "balance_loss_clip": 1.05276895, "balance_loss_mlp": 1.02782845, "epoch": 0.20892830302119345, "flos": 36539178929280.0, "grad_norm": 3.2311626254468795, "language_loss": 0.69970965, "learning_rate": 3.6734523249943844e-06, "loss": 0.72152305, "num_input_tokens_seen": 75173265, "step": 3475, "time_per_iteration": 2.7967529296875 }, { "auxiliary_loss_clip": 0.01158022, "auxiliary_loss_mlp": 0.01046944, "balance_loss_clip": 1.05606794, "balance_loss_mlp": 1.02862167, "epoch": 0.2089884262738614, "flos": 20956749361920.0, "grad_norm": 1.9789108228051473, "language_loss": 0.70372891, "learning_rate": 3.673239015669065e-06, "loss": 0.72577858, "num_input_tokens_seen": 75193640, "step": 3476, "time_per_iteration": 2.629687786102295 }, { "auxiliary_loss_clip": 0.01131765, "auxiliary_loss_mlp": 0.01045236, "balance_loss_clip": 1.05439556, "balance_loss_mlp": 1.02722347, "epoch": 0.20904854952652938, "flos": 22784028088320.0, "grad_norm": 2.3868812434184603, "language_loss": 0.89227062, "learning_rate": 3.6730256428943544e-06, "loss": 0.91404068, "num_input_tokens_seen": 75212545, "step": 3477, "time_per_iteration": 2.7574357986450195 }, { "auxiliary_loss_clip": 0.01092922, "auxiliary_loss_mlp": 0.01046119, "balance_loss_clip": 1.045825, "balance_loss_mlp": 1.02737951, "epoch": 0.20910867277919734, "flos": 27303277645440.0, "grad_norm": 2.6092415644893814, "language_loss": 0.67816859, "learning_rate": 3.672812206678344e-06, "loss": 0.69955903, "num_input_tokens_seen": 75230865, "step": 3478, "time_per_iteration": 2.7929017543792725 }, { "auxiliary_loss_clip": 0.01094689, "auxiliary_loss_mlp": 0.01042766, "balance_loss_clip": 1.04024661, "balance_loss_mlp": 1.02308464, "epoch": 0.20916879603186533, "flos": 14319237000960.0, "grad_norm": 4.056245481336458, "language_loss": 0.84239435, "learning_rate": 3.672598707029127e-06, "loss": 0.86376888, "num_input_tokens_seen": 75248285, "step": 3479, "time_per_iteration": 2.743544816970825 }, { "auxiliary_loss_clip": 0.01111533, "auxiliary_loss_mlp": 0.01050991, "balance_loss_clip": 1.04863191, "balance_loss_mlp": 1.03028417, "epoch": 0.2092289192845333, "flos": 22273019251200.0, "grad_norm": 9.599906344578406, "language_loss": 0.74294043, "learning_rate": 3.6723851439548003e-06, "loss": 0.76456571, "num_input_tokens_seen": 75266310, "step": 3480, "time_per_iteration": 2.7278034687042236 }, { "auxiliary_loss_clip": 0.01107791, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.04748154, "balance_loss_mlp": 1.02226901, "epoch": 0.20928904253720126, "flos": 14830712714880.0, "grad_norm": 2.178942595840573, "language_loss": 0.75664043, "learning_rate": 3.67217151746346e-06, "loss": 0.77810597, "num_input_tokens_seen": 75284175, "step": 3481, "time_per_iteration": 2.71073842048645 }, { "auxiliary_loss_clip": 0.01090021, "auxiliary_loss_mlp": 0.01046234, "balance_loss_clip": 1.04561555, "balance_loss_mlp": 1.02727938, "epoch": 0.20934916578986923, "flos": 23259162216960.0, "grad_norm": 1.816378391984801, "language_loss": 0.8517971, "learning_rate": 3.671957827563209e-06, "loss": 0.87315965, "num_input_tokens_seen": 75303465, "step": 3482, "time_per_iteration": 2.8777174949645996 }, { "auxiliary_loss_clip": 0.01099298, "auxiliary_loss_mlp": 0.01046228, "balance_loss_clip": 1.05039477, "balance_loss_mlp": 1.02817941, "epoch": 0.2094092890425372, "flos": 32014398677760.0, "grad_norm": 1.802490425012806, "language_loss": 0.70550174, "learning_rate": 3.6717440742621494e-06, "loss": 0.72695696, "num_input_tokens_seen": 75325290, "step": 3483, "time_per_iteration": 2.8599836826324463 }, { "auxiliary_loss_clip": 0.01127333, "auxiliary_loss_mlp": 0.01048954, "balance_loss_clip": 1.05204535, "balance_loss_mlp": 1.03082263, "epoch": 0.20946941229520516, "flos": 20010647082240.0, "grad_norm": 1.9649551735344426, "language_loss": 0.74867833, "learning_rate": 3.6715302575683865e-06, "loss": 0.77044123, "num_input_tokens_seen": 75343895, "step": 3484, "time_per_iteration": 2.655538320541382 }, { "auxiliary_loss_clip": 0.01117623, "auxiliary_loss_mlp": 0.01046902, "balance_loss_clip": 1.0514648, "balance_loss_mlp": 1.0274353, "epoch": 0.20952953554787315, "flos": 30740072895360.0, "grad_norm": 1.6308141537991403, "language_loss": 0.70815694, "learning_rate": 3.6713163774900292e-06, "loss": 0.72980225, "num_input_tokens_seen": 75367100, "step": 3485, "time_per_iteration": 2.744417667388916 }, { "auxiliary_loss_clip": 0.01083098, "auxiliary_loss_mlp": 0.00777163, "balance_loss_clip": 1.0433619, "balance_loss_mlp": 1.00097859, "epoch": 0.20958965880054112, "flos": 27049209770880.0, "grad_norm": 2.030771632516388, "language_loss": 0.83274543, "learning_rate": 3.6711024340351875e-06, "loss": 0.85134804, "num_input_tokens_seen": 75389925, "step": 3486, "time_per_iteration": 2.742042303085327 }, { "auxiliary_loss_clip": 0.01140212, "auxiliary_loss_mlp": 0.01048337, "balance_loss_clip": 1.05242062, "balance_loss_mlp": 1.03115916, "epoch": 0.20964978205320908, "flos": 34204123589760.0, "grad_norm": 1.6926372989653347, "language_loss": 0.87134725, "learning_rate": 3.6708884272119737e-06, "loss": 0.89323276, "num_input_tokens_seen": 75408575, "step": 3487, "time_per_iteration": 2.708331346511841 }, { "auxiliary_loss_clip": 0.01112214, "auxiliary_loss_mlp": 0.01041678, "balance_loss_clip": 1.04791641, "balance_loss_mlp": 1.0228194, "epoch": 0.20970990530587705, "flos": 23477391296640.0, "grad_norm": 4.471143750410675, "language_loss": 0.72291327, "learning_rate": 3.670674357028504e-06, "loss": 0.74445224, "num_input_tokens_seen": 75427155, "step": 3488, "time_per_iteration": 4.250715970993042 }, { "auxiliary_loss_clip": 0.01121403, "auxiliary_loss_mlp": 0.01037296, "balance_loss_clip": 1.05096245, "balance_loss_mlp": 1.02014148, "epoch": 0.209770028558545, "flos": 18551452976640.0, "grad_norm": 2.6694226497987437, "language_loss": 0.79665899, "learning_rate": 3.6704602234928945e-06, "loss": 0.81824595, "num_input_tokens_seen": 75444450, "step": 3489, "time_per_iteration": 2.6926958560943604 }, { "auxiliary_loss_clip": 0.01152639, "auxiliary_loss_mlp": 0.01045785, "balance_loss_clip": 1.05325401, "balance_loss_mlp": 1.02875018, "epoch": 0.20983015181121298, "flos": 21617003208960.0, "grad_norm": 2.022409198347131, "language_loss": 0.72505707, "learning_rate": 3.670246026613266e-06, "loss": 0.74704129, "num_input_tokens_seen": 75462625, "step": 3490, "time_per_iteration": 4.133761644363403 }, { "auxiliary_loss_clip": 0.01122247, "auxiliary_loss_mlp": 0.01050283, "balance_loss_clip": 1.0509479, "balance_loss_mlp": 1.03402328, "epoch": 0.20989027506388094, "flos": 16614718531200.0, "grad_norm": 1.8035978449536252, "language_loss": 0.70332754, "learning_rate": 3.6700317663977415e-06, "loss": 0.72505283, "num_input_tokens_seen": 75480640, "step": 3491, "time_per_iteration": 2.667243003845215 }, { "auxiliary_loss_clip": 0.0113848, "auxiliary_loss_mlp": 0.0077627, "balance_loss_clip": 1.05017376, "balance_loss_mlp": 1.00098944, "epoch": 0.20995039831654894, "flos": 23216823060480.0, "grad_norm": 2.379943808529104, "language_loss": 0.79751909, "learning_rate": 3.669817442854444e-06, "loss": 0.81666666, "num_input_tokens_seen": 75494900, "step": 3492, "time_per_iteration": 4.270704984664917 }, { "auxiliary_loss_clip": 0.01138825, "auxiliary_loss_mlp": 0.00776339, "balance_loss_clip": 1.05182219, "balance_loss_mlp": 1.00108409, "epoch": 0.2100105215692169, "flos": 18147493647360.0, "grad_norm": 2.2783194747149906, "language_loss": 0.86987948, "learning_rate": 3.669603055991502e-06, "loss": 0.88903111, "num_input_tokens_seen": 75513370, "step": 3493, "time_per_iteration": 2.7830448150634766 }, { "auxiliary_loss_clip": 0.01110786, "auxiliary_loss_mlp": 0.01037681, "balance_loss_clip": 1.04520118, "balance_loss_mlp": 1.02105093, "epoch": 0.21007064482188487, "flos": 15961611490560.0, "grad_norm": 6.813030650079402, "language_loss": 0.68622243, "learning_rate": 3.6693886058170455e-06, "loss": 0.70770705, "num_input_tokens_seen": 75532480, "step": 3494, "time_per_iteration": 2.8479061126708984 }, { "auxiliary_loss_clip": 0.01145467, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.05302739, "balance_loss_mlp": 1.01998639, "epoch": 0.21013076807455283, "flos": 32234315696640.0, "grad_norm": 1.7516454579581615, "language_loss": 0.78848761, "learning_rate": 3.6691740923392053e-06, "loss": 0.81031501, "num_input_tokens_seen": 75552745, "step": 3495, "time_per_iteration": 2.9313197135925293 }, { "auxiliary_loss_clip": 0.01119614, "auxiliary_loss_mlp": 0.01045108, "balance_loss_clip": 1.04760814, "balance_loss_mlp": 1.02708316, "epoch": 0.2101908913272208, "flos": 23696625957120.0, "grad_norm": 2.1492916784611844, "language_loss": 0.77302933, "learning_rate": 3.668959515566116e-06, "loss": 0.79467654, "num_input_tokens_seen": 75574355, "step": 3496, "time_per_iteration": 4.467881441116333 }, { "auxiliary_loss_clip": 0.01135202, "auxiliary_loss_mlp": 0.01046618, "balance_loss_clip": 1.05169654, "balance_loss_mlp": 1.02839065, "epoch": 0.21025101457988876, "flos": 20375786787840.0, "grad_norm": 2.146148958862047, "language_loss": 0.82076812, "learning_rate": 3.668744875505915e-06, "loss": 0.8425864, "num_input_tokens_seen": 75592215, "step": 3497, "time_per_iteration": 2.683037281036377 }, { "auxiliary_loss_clip": 0.01144559, "auxiliary_loss_mlp": 0.01047188, "balance_loss_clip": 1.05445957, "balance_loss_mlp": 1.02967596, "epoch": 0.21031113783255675, "flos": 25775638174080.0, "grad_norm": 1.732381679276629, "language_loss": 0.67239833, "learning_rate": 3.668530172166741e-06, "loss": 0.69431579, "num_input_tokens_seen": 75610740, "step": 3498, "time_per_iteration": 2.685481548309326 }, { "auxiliary_loss_clip": 0.01121255, "auxiliary_loss_mlp": 0.01044553, "balance_loss_clip": 1.04974794, "balance_loss_mlp": 1.02611172, "epoch": 0.21037126108522472, "flos": 22018197191040.0, "grad_norm": 1.7892967196850054, "language_loss": 0.80832362, "learning_rate": 3.6683154055567352e-06, "loss": 0.82998168, "num_input_tokens_seen": 75631005, "step": 3499, "time_per_iteration": 2.744995355606079 }, { "auxiliary_loss_clip": 0.01139753, "auxiliary_loss_mlp": 0.01039729, "balance_loss_clip": 1.05226696, "balance_loss_mlp": 1.02312946, "epoch": 0.21043138433789269, "flos": 25334403505920.0, "grad_norm": 1.6464696881852638, "language_loss": 0.77983701, "learning_rate": 3.668100575684043e-06, "loss": 0.80163181, "num_input_tokens_seen": 75650655, "step": 3500, "time_per_iteration": 2.7704038619995117 }, { "auxiliary_loss_clip": 0.01129369, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.05095315, "balance_loss_mlp": 1.02390063, "epoch": 0.21049150759056065, "flos": 25556654908800.0, "grad_norm": 1.5981262394728393, "language_loss": 0.74450207, "learning_rate": 3.6678856825568094e-06, "loss": 0.76621759, "num_input_tokens_seen": 75669895, "step": 3501, "time_per_iteration": 2.7066893577575684 }, { "auxiliary_loss_clip": 0.01134924, "auxiliary_loss_mlp": 0.01039556, "balance_loss_clip": 1.04989994, "balance_loss_mlp": 1.02227044, "epoch": 0.21055163084322862, "flos": 24495602129280.0, "grad_norm": 1.6188770382514572, "language_loss": 0.75278366, "learning_rate": 3.667670726183183e-06, "loss": 0.77452844, "num_input_tokens_seen": 75689535, "step": 3502, "time_per_iteration": 2.724635124206543 }, { "auxiliary_loss_clip": 0.01098479, "auxiliary_loss_mlp": 0.01040924, "balance_loss_clip": 1.04576206, "balance_loss_mlp": 1.02248216, "epoch": 0.21061175409589658, "flos": 25739045193600.0, "grad_norm": 1.9441266701933382, "language_loss": 0.77188909, "learning_rate": 3.667455706571316e-06, "loss": 0.7932831, "num_input_tokens_seen": 75709265, "step": 3503, "time_per_iteration": 2.7545289993286133 }, { "auxiliary_loss_clip": 0.010957, "auxiliary_loss_mlp": 0.01045911, "balance_loss_clip": 1.04817343, "balance_loss_mlp": 1.02478695, "epoch": 0.21067187734856455, "flos": 18989168112000.0, "grad_norm": 2.256374081289255, "language_loss": 0.78297234, "learning_rate": 3.6672406237293617e-06, "loss": 0.8043884, "num_input_tokens_seen": 75727050, "step": 3504, "time_per_iteration": 2.7454304695129395 }, { "auxiliary_loss_clip": 0.01117408, "auxiliary_loss_mlp": 0.01049815, "balance_loss_clip": 1.0488404, "balance_loss_mlp": 1.03152788, "epoch": 0.21073200060123254, "flos": 24681368292480.0, "grad_norm": 1.5753219052286964, "language_loss": 0.76731002, "learning_rate": 3.6670254776654754e-06, "loss": 0.78898227, "num_input_tokens_seen": 75747175, "step": 3505, "time_per_iteration": 2.7509703636169434 }, { "auxiliary_loss_clip": 0.01120291, "auxiliary_loss_mlp": 0.01052026, "balance_loss_clip": 1.04882348, "balance_loss_mlp": 1.03383446, "epoch": 0.2107921238539005, "flos": 28549342402560.0, "grad_norm": 1.9938386598136906, "language_loss": 0.63933277, "learning_rate": 3.6668102683878163e-06, "loss": 0.66105598, "num_input_tokens_seen": 75767690, "step": 3506, "time_per_iteration": 2.773611545562744 }, { "auxiliary_loss_clip": 0.01138444, "auxiliary_loss_mlp": 0.01050655, "balance_loss_clip": 1.05078697, "balance_loss_mlp": 1.03257108, "epoch": 0.21085224710656847, "flos": 25885848078720.0, "grad_norm": 2.170999698474249, "language_loss": 0.82010436, "learning_rate": 3.6665949959045443e-06, "loss": 0.84199536, "num_input_tokens_seen": 75787255, "step": 3507, "time_per_iteration": 2.6604206562042236 }, { "auxiliary_loss_clip": 0.01136754, "auxiliary_loss_mlp": 0.01043314, "balance_loss_clip": 1.04972744, "balance_loss_mlp": 1.02472949, "epoch": 0.21091237035923643, "flos": 14976294537600.0, "grad_norm": 2.0519706535557414, "language_loss": 0.75213134, "learning_rate": 3.666379660223824e-06, "loss": 0.77393204, "num_input_tokens_seen": 75805890, "step": 3508, "time_per_iteration": 2.7164604663848877 }, { "auxiliary_loss_clip": 0.01154655, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.05263913, "balance_loss_mlp": 1.01894128, "epoch": 0.2109724936119044, "flos": 16362518163840.0, "grad_norm": 3.4182125548434112, "language_loss": 0.84984946, "learning_rate": 3.6661642613538192e-06, "loss": 0.87176406, "num_input_tokens_seen": 75821620, "step": 3509, "time_per_iteration": 2.661743402481079 }, { "auxiliary_loss_clip": 0.01120944, "auxiliary_loss_mlp": 0.01044014, "balance_loss_clip": 1.05299115, "balance_loss_mlp": 1.02443957, "epoch": 0.21103261686457236, "flos": 31502492000640.0, "grad_norm": 2.210880078691599, "language_loss": 0.68125075, "learning_rate": 3.6659487993026987e-06, "loss": 0.70290035, "num_input_tokens_seen": 75842490, "step": 3510, "time_per_iteration": 2.7881460189819336 }, { "auxiliary_loss_clip": 0.01152569, "auxiliary_loss_mlp": 0.01046993, "balance_loss_clip": 1.05026078, "balance_loss_mlp": 1.02892137, "epoch": 0.21109274011724033, "flos": 27344072517120.0, "grad_norm": 1.958863999940011, "language_loss": 0.72639364, "learning_rate": 3.6657332740786327e-06, "loss": 0.74838924, "num_input_tokens_seen": 75865985, "step": 3511, "time_per_iteration": 2.6942689418792725 }, { "auxiliary_loss_clip": 0.01066393, "auxiliary_loss_mlp": 0.01041278, "balance_loss_clip": 1.04279399, "balance_loss_mlp": 1.0208931, "epoch": 0.21115286336990832, "flos": 17820383466240.0, "grad_norm": 3.2801391377369686, "language_loss": 0.69354337, "learning_rate": 3.665517685689794e-06, "loss": 0.71462011, "num_input_tokens_seen": 75882745, "step": 3512, "time_per_iteration": 2.8260998725891113 }, { "auxiliary_loss_clip": 0.01140043, "auxiliary_loss_mlp": 0.01050555, "balance_loss_clip": 1.04943943, "balance_loss_mlp": 1.03082585, "epoch": 0.2112129866225763, "flos": 27197987904000.0, "grad_norm": 2.072678482519775, "language_loss": 0.73145646, "learning_rate": 3.6653020341443584e-06, "loss": 0.75336242, "num_input_tokens_seen": 75904305, "step": 3513, "time_per_iteration": 2.9639391899108887 }, { "auxiliary_loss_clip": 0.01121964, "auxiliary_loss_mlp": 0.01038325, "balance_loss_clip": 1.04785061, "balance_loss_mlp": 1.02089679, "epoch": 0.21127310987524425, "flos": 23731279603200.0, "grad_norm": 2.0322171916220086, "language_loss": 0.74422491, "learning_rate": 3.665086319450502e-06, "loss": 0.76582778, "num_input_tokens_seen": 75923710, "step": 3514, "time_per_iteration": 2.7379143238067627 }, { "auxiliary_loss_clip": 0.01136944, "auxiliary_loss_mlp": 0.01038225, "balance_loss_clip": 1.05334568, "balance_loss_mlp": 1.01941383, "epoch": 0.21133323312791222, "flos": 18332505624960.0, "grad_norm": 2.431934297389972, "language_loss": 0.76738697, "learning_rate": 3.6648705416164062e-06, "loss": 0.78913867, "num_input_tokens_seen": 75942625, "step": 3515, "time_per_iteration": 2.6339287757873535 }, { "auxiliary_loss_clip": 0.011289, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.05247736, "balance_loss_mlp": 1.0288614, "epoch": 0.21139335638058018, "flos": 17931203902080.0, "grad_norm": 2.7460645413082756, "language_loss": 0.68756706, "learning_rate": 3.6646547006502518e-06, "loss": 0.70933092, "num_input_tokens_seen": 75959930, "step": 3516, "time_per_iteration": 2.6489672660827637 }, { "auxiliary_loss_clip": 0.01118182, "auxiliary_loss_mlp": 0.01049447, "balance_loss_clip": 1.05634522, "balance_loss_mlp": 1.03045666, "epoch": 0.21145347963324815, "flos": 24572092141440.0, "grad_norm": 1.8368744753927078, "language_loss": 0.85010064, "learning_rate": 3.664438796560225e-06, "loss": 0.87177694, "num_input_tokens_seen": 75980335, "step": 3517, "time_per_iteration": 2.745887279510498 }, { "auxiliary_loss_clip": 0.01125904, "auxiliary_loss_mlp": 0.01042813, "balance_loss_clip": 1.04719234, "balance_loss_mlp": 1.02506244, "epoch": 0.21151360288591614, "flos": 35845959375360.0, "grad_norm": 2.246330970109572, "language_loss": 0.63672101, "learning_rate": 3.664222829354512e-06, "loss": 0.65840822, "num_input_tokens_seen": 76002095, "step": 3518, "time_per_iteration": 2.7990219593048096 }, { "auxiliary_loss_clip": 0.01089367, "auxiliary_loss_mlp": 0.01057733, "balance_loss_clip": 1.05040181, "balance_loss_mlp": 1.04001832, "epoch": 0.2115737261385841, "flos": 24641579001600.0, "grad_norm": 2.1349107177710875, "language_loss": 0.89256221, "learning_rate": 3.664006799041303e-06, "loss": 0.91403317, "num_input_tokens_seen": 76020425, "step": 3519, "time_per_iteration": 2.8022944927215576 }, { "auxiliary_loss_clip": 0.01135146, "auxiliary_loss_mlp": 0.01049587, "balance_loss_clip": 1.05320001, "balance_loss_mlp": 1.03140712, "epoch": 0.21163384939125207, "flos": 25226887121280.0, "grad_norm": 1.8050755180524396, "language_loss": 0.81235015, "learning_rate": 3.6637907056287886e-06, "loss": 0.8341974, "num_input_tokens_seen": 76041210, "step": 3520, "time_per_iteration": 2.750988245010376 }, { "auxiliary_loss_clip": 0.01124406, "auxiliary_loss_mlp": 0.01048631, "balance_loss_clip": 1.05111551, "balance_loss_mlp": 1.03095269, "epoch": 0.21169397264392004, "flos": 26067520091520.0, "grad_norm": 1.92815865975435, "language_loss": 0.76254267, "learning_rate": 3.6635745491251642e-06, "loss": 0.78427303, "num_input_tokens_seen": 76062685, "step": 3521, "time_per_iteration": 2.7965810298919678 }, { "auxiliary_loss_clip": 0.0109789, "auxiliary_loss_mlp": 0.01044794, "balance_loss_clip": 1.04872918, "balance_loss_mlp": 1.02841413, "epoch": 0.211754095896588, "flos": 23108265181440.0, "grad_norm": 2.0270933567011302, "language_loss": 0.75752926, "learning_rate": 3.663358329538626e-06, "loss": 0.77895606, "num_input_tokens_seen": 76082300, "step": 3522, "time_per_iteration": 2.8280131816864014 }, { "auxiliary_loss_clip": 0.01153324, "auxiliary_loss_mlp": 0.01053431, "balance_loss_clip": 1.05353725, "balance_loss_mlp": 1.03541851, "epoch": 0.21181421914925597, "flos": 27922341571200.0, "grad_norm": 1.8399634756194385, "language_loss": 0.70481133, "learning_rate": 3.663142046877374e-06, "loss": 0.72687888, "num_input_tokens_seen": 76101135, "step": 3523, "time_per_iteration": 2.6909022331237793 }, { "auxiliary_loss_clip": 0.01139749, "auxiliary_loss_mlp": 0.01054127, "balance_loss_clip": 1.05166054, "balance_loss_mlp": 1.03619766, "epoch": 0.21187434240192393, "flos": 17128636369920.0, "grad_norm": 2.455264594190525, "language_loss": 0.77290082, "learning_rate": 3.6629257011496085e-06, "loss": 0.7948395, "num_input_tokens_seen": 76119320, "step": 3524, "time_per_iteration": 2.6844334602355957 }, { "auxiliary_loss_clip": 0.01132697, "auxiliary_loss_mlp": 0.0104457, "balance_loss_clip": 1.05066419, "balance_loss_mlp": 1.02621162, "epoch": 0.21193446565459192, "flos": 22347318533760.0, "grad_norm": 1.841652047976503, "language_loss": 0.81680572, "learning_rate": 3.6627092923635338e-06, "loss": 0.83857846, "num_input_tokens_seen": 76137445, "step": 3525, "time_per_iteration": 2.71073842048645 }, { "auxiliary_loss_clip": 0.01088536, "auxiliary_loss_mlp": 0.01041509, "balance_loss_clip": 1.04158318, "balance_loss_mlp": 1.02353263, "epoch": 0.2119945889072599, "flos": 27199316707200.0, "grad_norm": 1.867957043941215, "language_loss": 0.75627208, "learning_rate": 3.662492820527356e-06, "loss": 0.77757257, "num_input_tokens_seen": 76159500, "step": 3526, "time_per_iteration": 2.973966598510742 }, { "auxiliary_loss_clip": 0.0115455, "auxiliary_loss_mlp": 0.01041027, "balance_loss_clip": 1.05324817, "balance_loss_mlp": 1.023229, "epoch": 0.21205471215992786, "flos": 20991869884800.0, "grad_norm": 1.8230643924086412, "language_loss": 0.77070421, "learning_rate": 3.662276285649284e-06, "loss": 0.79265994, "num_input_tokens_seen": 76177990, "step": 3527, "time_per_iteration": 2.648961067199707 }, { "auxiliary_loss_clip": 0.01151081, "auxiliary_loss_mlp": 0.0104874, "balance_loss_clip": 1.05143785, "balance_loss_mlp": 1.02977419, "epoch": 0.21211483541259582, "flos": 20777663128320.0, "grad_norm": 2.807984733302778, "language_loss": 0.7815178, "learning_rate": 3.662059687737528e-06, "loss": 0.80351603, "num_input_tokens_seen": 76197125, "step": 3528, "time_per_iteration": 4.401185989379883 }, { "auxiliary_loss_clip": 0.01135768, "auxiliary_loss_mlp": 0.01045736, "balance_loss_clip": 1.04889631, "balance_loss_mlp": 1.02817655, "epoch": 0.21217495866526379, "flos": 18989994124800.0, "grad_norm": 2.1271435469609257, "language_loss": 0.8128866, "learning_rate": 3.6618430268003024e-06, "loss": 0.8347016, "num_input_tokens_seen": 76216215, "step": 3529, "time_per_iteration": 4.309772968292236 }, { "auxiliary_loss_clip": 0.0113319, "auxiliary_loss_mlp": 0.00777373, "balance_loss_clip": 1.04967499, "balance_loss_mlp": 1.00112891, "epoch": 0.21223508191793175, "flos": 20667309569280.0, "grad_norm": 1.9704727824538568, "language_loss": 0.76427567, "learning_rate": 3.6616263028458235e-06, "loss": 0.78338128, "num_input_tokens_seen": 76237010, "step": 3530, "time_per_iteration": 2.7592365741729736 }, { "auxiliary_loss_clip": 0.0115078, "auxiliary_loss_mlp": 0.01047067, "balance_loss_clip": 1.0522244, "balance_loss_mlp": 1.02990103, "epoch": 0.21229520517059972, "flos": 21616464504960.0, "grad_norm": 2.1154933827202274, "language_loss": 0.82973897, "learning_rate": 3.661409515882308e-06, "loss": 0.85171747, "num_input_tokens_seen": 76255965, "step": 3531, "time_per_iteration": 4.168981313705444 }, { "auxiliary_loss_clip": 0.01120152, "auxiliary_loss_mlp": 0.01042697, "balance_loss_clip": 1.04767489, "balance_loss_mlp": 1.02313459, "epoch": 0.2123553284232677, "flos": 13991049411840.0, "grad_norm": 2.335526210972433, "language_loss": 0.73087364, "learning_rate": 3.661192665917977e-06, "loss": 0.75250214, "num_input_tokens_seen": 76272150, "step": 3532, "time_per_iteration": 2.6797189712524414 }, { "auxiliary_loss_clip": 0.01126693, "auxiliary_loss_mlp": 0.01041409, "balance_loss_clip": 1.0539782, "balance_loss_mlp": 1.02269292, "epoch": 0.21241545167593567, "flos": 18296774570880.0, "grad_norm": 6.22254473074881, "language_loss": 0.74268675, "learning_rate": 3.660975752961054e-06, "loss": 0.76436776, "num_input_tokens_seen": 76291425, "step": 3533, "time_per_iteration": 2.741152048110962 }, { "auxiliary_loss_clip": 0.01146682, "auxiliary_loss_mlp": 0.0104438, "balance_loss_clip": 1.05342829, "balance_loss_mlp": 1.0265224, "epoch": 0.21247557492860364, "flos": 34713121265280.0, "grad_norm": 2.0406923816018714, "language_loss": 0.70889592, "learning_rate": 3.6607587770197634e-06, "loss": 0.73080653, "num_input_tokens_seen": 76313975, "step": 3534, "time_per_iteration": 2.8210513591766357 }, { "auxiliary_loss_clip": 0.01133157, "auxiliary_loss_mlp": 0.01043651, "balance_loss_clip": 1.05234385, "balance_loss_mlp": 1.02463722, "epoch": 0.2125356981812716, "flos": 22053820504320.0, "grad_norm": 2.102271516852891, "language_loss": 0.71675557, "learning_rate": 3.6605417381023346e-06, "loss": 0.73852366, "num_input_tokens_seen": 76330955, "step": 3535, "time_per_iteration": 2.804506540298462 }, { "auxiliary_loss_clip": 0.01137461, "auxiliary_loss_mlp": 0.01053804, "balance_loss_clip": 1.05108476, "balance_loss_mlp": 1.03607774, "epoch": 0.21259582143393957, "flos": 28548336821760.0, "grad_norm": 24.01704513629389, "language_loss": 0.70639503, "learning_rate": 3.660324636216996e-06, "loss": 0.72830772, "num_input_tokens_seen": 76352680, "step": 3536, "time_per_iteration": 4.442729473114014 }, { "auxiliary_loss_clip": 0.011554, "auxiliary_loss_mlp": 0.01049939, "balance_loss_clip": 1.05231214, "balance_loss_mlp": 1.03082991, "epoch": 0.21265594468660753, "flos": 20120892900480.0, "grad_norm": 2.2527167001205806, "language_loss": 0.8784188, "learning_rate": 3.660107471371981e-06, "loss": 0.90047216, "num_input_tokens_seen": 76370750, "step": 3537, "time_per_iteration": 2.6365723609924316 }, { "auxiliary_loss_clip": 0.01137536, "auxiliary_loss_mlp": 0.00776226, "balance_loss_clip": 1.04911351, "balance_loss_mlp": 1.00101614, "epoch": 0.21271606793927553, "flos": 23076161400960.0, "grad_norm": 1.8080285651248438, "language_loss": 0.80480909, "learning_rate": 3.659890243575524e-06, "loss": 0.82394671, "num_input_tokens_seen": 76390610, "step": 3538, "time_per_iteration": 2.7403554916381836 }, { "auxiliary_loss_clip": 0.01080631, "auxiliary_loss_mlp": 0.0105169, "balance_loss_clip": 1.04171312, "balance_loss_mlp": 1.03219926, "epoch": 0.2127761911919435, "flos": 26388201738240.0, "grad_norm": 2.705287390300715, "language_loss": 0.86691839, "learning_rate": 3.659672952835863e-06, "loss": 0.88824159, "num_input_tokens_seen": 76408860, "step": 3539, "time_per_iteration": 2.8177876472473145 }, { "auxiliary_loss_clip": 0.01120184, "auxiliary_loss_mlp": 0.01047424, "balance_loss_clip": 1.04577422, "balance_loss_mlp": 1.0295074, "epoch": 0.21283631444461146, "flos": 20228265630720.0, "grad_norm": 5.212413836862573, "language_loss": 0.57756186, "learning_rate": 3.659455599161237e-06, "loss": 0.59923792, "num_input_tokens_seen": 76424980, "step": 3540, "time_per_iteration": 2.786552667617798 }, { "auxiliary_loss_clip": 0.01154193, "auxiliary_loss_mlp": 0.010403, "balance_loss_clip": 1.05276537, "balance_loss_mlp": 1.02131045, "epoch": 0.21289643769727942, "flos": 13516992691200.0, "grad_norm": 2.318388810062464, "language_loss": 0.76114893, "learning_rate": 3.659238182559888e-06, "loss": 0.78309381, "num_input_tokens_seen": 76443135, "step": 3541, "time_per_iteration": 2.646207332611084 }, { "auxiliary_loss_clip": 0.01108241, "auxiliary_loss_mlp": 0.01044876, "balance_loss_clip": 1.0464325, "balance_loss_mlp": 1.02676797, "epoch": 0.2129565609499474, "flos": 24827021942400.0, "grad_norm": 3.508596736579257, "language_loss": 0.69749588, "learning_rate": 3.6590207030400615e-06, "loss": 0.71902704, "num_input_tokens_seen": 76462470, "step": 3542, "time_per_iteration": 2.746612787246704 }, { "auxiliary_loss_clip": 0.01149445, "auxiliary_loss_mlp": 0.01038617, "balance_loss_clip": 1.05146265, "balance_loss_mlp": 1.02160525, "epoch": 0.21301668420261535, "flos": 23659242877440.0, "grad_norm": 2.3488794859192397, "language_loss": 0.75651306, "learning_rate": 3.658803160610004e-06, "loss": 0.77839369, "num_input_tokens_seen": 76481995, "step": 3543, "time_per_iteration": 2.665900230407715 }, { "auxiliary_loss_clip": 0.0112855, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.05257249, "balance_loss_mlp": 1.02409506, "epoch": 0.21307680745528332, "flos": 16362805472640.0, "grad_norm": 1.8076409354305347, "language_loss": 0.66981912, "learning_rate": 3.6585855552779634e-06, "loss": 0.6915251, "num_input_tokens_seen": 76500245, "step": 3544, "time_per_iteration": 2.6692638397216797 }, { "auxiliary_loss_clip": 0.01121216, "auxiliary_loss_mlp": 0.01046395, "balance_loss_clip": 1.0480237, "balance_loss_mlp": 1.02897835, "epoch": 0.2131369307079513, "flos": 19099054794240.0, "grad_norm": 1.8644107460894377, "language_loss": 0.70977402, "learning_rate": 3.6583678870521934e-06, "loss": 0.73145014, "num_input_tokens_seen": 76519535, "step": 3545, "time_per_iteration": 2.686939001083374 }, { "auxiliary_loss_clip": 0.01128605, "auxiliary_loss_mlp": 0.01048325, "balance_loss_clip": 1.05368018, "balance_loss_mlp": 1.0300498, "epoch": 0.21319705396061928, "flos": 30372275583360.0, "grad_norm": 1.8809403827144264, "language_loss": 0.72329843, "learning_rate": 3.658150155940946e-06, "loss": 0.74506772, "num_input_tokens_seen": 76542065, "step": 3546, "time_per_iteration": 2.8044040203094482 }, { "auxiliary_loss_clip": 0.01115103, "auxiliary_loss_mlp": 0.01050245, "balance_loss_clip": 1.0539, "balance_loss_mlp": 1.03250647, "epoch": 0.21325717721328724, "flos": 21756192410880.0, "grad_norm": 3.48585993087404, "language_loss": 0.80431038, "learning_rate": 3.657932361952479e-06, "loss": 0.82596385, "num_input_tokens_seen": 76560540, "step": 3547, "time_per_iteration": 2.7981739044189453 }, { "auxiliary_loss_clip": 0.01154388, "auxiliary_loss_mlp": 0.01045355, "balance_loss_clip": 1.05115056, "balance_loss_mlp": 1.02685428, "epoch": 0.2133173004659552, "flos": 28730870760960.0, "grad_norm": 2.460294966859189, "language_loss": 0.7449761, "learning_rate": 3.6577145050950504e-06, "loss": 0.7669735, "num_input_tokens_seen": 76581760, "step": 3548, "time_per_iteration": 2.709476947784424 }, { "auxiliary_loss_clip": 0.01117193, "auxiliary_loss_mlp": 0.01059153, "balance_loss_clip": 1.05099797, "balance_loss_mlp": 1.03938842, "epoch": 0.21337742371862317, "flos": 16837077674880.0, "grad_norm": 2.783715227630402, "language_loss": 0.74218595, "learning_rate": 3.657496585376922e-06, "loss": 0.76394939, "num_input_tokens_seen": 76599940, "step": 3549, "time_per_iteration": 2.751401662826538 }, { "auxiliary_loss_clip": 0.01121431, "auxiliary_loss_mlp": 0.01050546, "balance_loss_clip": 1.05331278, "balance_loss_mlp": 1.03283179, "epoch": 0.21343754697129114, "flos": 24424930120320.0, "grad_norm": 1.8583266555890872, "language_loss": 0.80719978, "learning_rate": 3.657278602806357e-06, "loss": 0.82891953, "num_input_tokens_seen": 76619580, "step": 3550, "time_per_iteration": 2.74678373336792 }, { "auxiliary_loss_clip": 0.01151996, "auxiliary_loss_mlp": 0.01048347, "balance_loss_clip": 1.05428052, "balance_loss_mlp": 1.03147876, "epoch": 0.21349767022395913, "flos": 19277817805440.0, "grad_norm": 1.7548210279469212, "language_loss": 0.88234103, "learning_rate": 3.657060557391621e-06, "loss": 0.90434444, "num_input_tokens_seen": 76638195, "step": 3551, "time_per_iteration": 2.746938705444336 }, { "auxiliary_loss_clip": 0.01151269, "auxiliary_loss_mlp": 0.01048306, "balance_loss_clip": 1.05139017, "balance_loss_mlp": 1.03111625, "epoch": 0.2135577934766271, "flos": 17347547808000.0, "grad_norm": 1.8976063035050816, "language_loss": 0.83877259, "learning_rate": 3.656842449140983e-06, "loss": 0.86076838, "num_input_tokens_seen": 76656695, "step": 3552, "time_per_iteration": 2.616567373275757 }, { "auxiliary_loss_clip": 0.0113626, "auxiliary_loss_mlp": 0.01050705, "balance_loss_clip": 1.04937124, "balance_loss_mlp": 1.0325495, "epoch": 0.21361791672929506, "flos": 24057204635520.0, "grad_norm": 2.604872460919843, "language_loss": 0.76370007, "learning_rate": 3.656624278062713e-06, "loss": 0.78556973, "num_input_tokens_seen": 76677430, "step": 3553, "time_per_iteration": 2.730829954147339 }, { "auxiliary_loss_clip": 0.01142267, "auxiliary_loss_mlp": 0.01046102, "balance_loss_clip": 1.05434144, "balance_loss_mlp": 1.02915072, "epoch": 0.21367803998196302, "flos": 22162306556160.0, "grad_norm": 1.5008078028945642, "language_loss": 0.72580731, "learning_rate": 3.6564060441650843e-06, "loss": 0.74769098, "num_input_tokens_seen": 76697615, "step": 3554, "time_per_iteration": 2.701207399368286 }, { "auxiliary_loss_clip": 0.01097601, "auxiliary_loss_mlp": 0.00776401, "balance_loss_clip": 1.04785013, "balance_loss_mlp": 1.00128174, "epoch": 0.213738163234631, "flos": 20886867452160.0, "grad_norm": 2.0681583889949957, "language_loss": 0.67728174, "learning_rate": 3.6561877474563724e-06, "loss": 0.69602168, "num_input_tokens_seen": 76715685, "step": 3555, "time_per_iteration": 2.76454758644104 }, { "auxiliary_loss_clip": 0.01124456, "auxiliary_loss_mlp": 0.01045031, "balance_loss_clip": 1.06086278, "balance_loss_mlp": 1.02689981, "epoch": 0.21379828648729896, "flos": 28403114135040.0, "grad_norm": 2.155752981705525, "language_loss": 0.64553648, "learning_rate": 3.6559693879448553e-06, "loss": 0.66723132, "num_input_tokens_seen": 76735405, "step": 3556, "time_per_iteration": 2.839993953704834 }, { "auxiliary_loss_clip": 0.01139371, "auxiliary_loss_mlp": 0.01051642, "balance_loss_clip": 1.05236566, "balance_loss_mlp": 1.0331769, "epoch": 0.21385840973996692, "flos": 25479662106240.0, "grad_norm": 1.7378281716746964, "language_loss": 0.72588408, "learning_rate": 3.6557509656388125e-06, "loss": 0.74779421, "num_input_tokens_seen": 76754395, "step": 3557, "time_per_iteration": 2.7678587436676025 }, { "auxiliary_loss_clip": 0.01151319, "auxiliary_loss_mlp": 0.00776703, "balance_loss_clip": 1.0647192, "balance_loss_mlp": 1.00117195, "epoch": 0.2139185329926349, "flos": 28074280101120.0, "grad_norm": 1.8333462571334693, "language_loss": 0.6714859, "learning_rate": 3.655532480546528e-06, "loss": 0.6907661, "num_input_tokens_seen": 76777210, "step": 3558, "time_per_iteration": 2.7584826946258545 }, { "auxiliary_loss_clip": 0.01159331, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.0541842, "balance_loss_mlp": 1.02297139, "epoch": 0.21397865624530288, "flos": 19608698914560.0, "grad_norm": 1.8974456617751176, "language_loss": 0.79882181, "learning_rate": 3.655313932676286e-06, "loss": 0.82082617, "num_input_tokens_seen": 76795830, "step": 3559, "time_per_iteration": 2.6918041706085205 }, { "auxiliary_loss_clip": 0.01155068, "auxiliary_loss_mlp": 0.01046018, "balance_loss_clip": 1.05566323, "balance_loss_mlp": 1.0295198, "epoch": 0.21403877949797084, "flos": 24681476033280.0, "grad_norm": 1.8730564704536732, "language_loss": 0.68085694, "learning_rate": 3.655095322036373e-06, "loss": 0.70286781, "num_input_tokens_seen": 76814700, "step": 3560, "time_per_iteration": 2.6445770263671875 }, { "auxiliary_loss_clip": 0.01145074, "auxiliary_loss_mlp": 0.01043706, "balance_loss_clip": 1.0535686, "balance_loss_mlp": 1.02537155, "epoch": 0.2140989027506388, "flos": 19861150677120.0, "grad_norm": 1.8952415763477797, "language_loss": 0.73272544, "learning_rate": 3.65487664863508e-06, "loss": 0.75461322, "num_input_tokens_seen": 76833400, "step": 3561, "time_per_iteration": 2.6568899154663086 }, { "auxiliary_loss_clip": 0.01133795, "auxiliary_loss_mlp": 0.01044555, "balance_loss_clip": 1.05333674, "balance_loss_mlp": 1.02700794, "epoch": 0.21415902600330677, "flos": 19135324552320.0, "grad_norm": 2.1953085541278203, "language_loss": 0.78028738, "learning_rate": 3.654657912480698e-06, "loss": 0.80207092, "num_input_tokens_seen": 76850645, "step": 3562, "time_per_iteration": 2.73655104637146 }, { "auxiliary_loss_clip": 0.01155634, "auxiliary_loss_mlp": 0.01042255, "balance_loss_clip": 1.05661631, "balance_loss_mlp": 1.02457595, "epoch": 0.21421914925597474, "flos": 22272624201600.0, "grad_norm": 3.5245068195694937, "language_loss": 0.84338713, "learning_rate": 3.6544391135815237e-06, "loss": 0.86536604, "num_input_tokens_seen": 76870135, "step": 3563, "time_per_iteration": 2.676630973815918 }, { "auxiliary_loss_clip": 0.01157426, "auxiliary_loss_mlp": 0.01036109, "balance_loss_clip": 1.05830729, "balance_loss_mlp": 1.01957488, "epoch": 0.2142792725086427, "flos": 33875109987840.0, "grad_norm": 1.5172669047015535, "language_loss": 0.76581991, "learning_rate": 3.6542202519458507e-06, "loss": 0.78775525, "num_input_tokens_seen": 76893905, "step": 3564, "time_per_iteration": 2.7504193782806396 }, { "auxiliary_loss_clip": 0.01134427, "auxiliary_loss_mlp": 0.01044002, "balance_loss_clip": 1.06131172, "balance_loss_mlp": 1.02674031, "epoch": 0.2143393957613107, "flos": 19860216923520.0, "grad_norm": 1.7115347614953564, "language_loss": 0.88466394, "learning_rate": 3.654001327581981e-06, "loss": 0.90644825, "num_input_tokens_seen": 76914205, "step": 3565, "time_per_iteration": 2.7911624908447266 }, { "auxiliary_loss_clip": 0.01071735, "auxiliary_loss_mlp": 0.01008336, "balance_loss_clip": 1.05462575, "balance_loss_mlp": 1.0057019, "epoch": 0.21439951901397866, "flos": 68530093090560.0, "grad_norm": 0.8339683756542131, "language_loss": 0.52192736, "learning_rate": 3.653782340498215e-06, "loss": 0.54272807, "num_input_tokens_seen": 76975650, "step": 3566, "time_per_iteration": 3.1801936626434326 }, { "auxiliary_loss_clip": 0.01141614, "auxiliary_loss_mlp": 0.01041326, "balance_loss_clip": 1.05527854, "balance_loss_mlp": 1.02505386, "epoch": 0.21445964226664663, "flos": 19682998197120.0, "grad_norm": 1.8485820369681922, "language_loss": 0.67324477, "learning_rate": 3.6535632907028566e-06, "loss": 0.6950742, "num_input_tokens_seen": 76992615, "step": 3567, "time_per_iteration": 2.6948626041412354 }, { "auxiliary_loss_clip": 0.01123629, "auxiliary_loss_mlp": 0.01045447, "balance_loss_clip": 1.05142832, "balance_loss_mlp": 1.02749455, "epoch": 0.2145197655193146, "flos": 31107259676160.0, "grad_norm": 3.2542445550844317, "language_loss": 0.74213678, "learning_rate": 3.6533441782042126e-06, "loss": 0.76382756, "num_input_tokens_seen": 77017005, "step": 3568, "time_per_iteration": 4.396210670471191 }, { "auxiliary_loss_clip": 0.01140095, "auxiliary_loss_mlp": 0.01050029, "balance_loss_clip": 1.05480075, "balance_loss_mlp": 1.03333998, "epoch": 0.21457988877198256, "flos": 20120785159680.0, "grad_norm": 1.7132363384404574, "language_loss": 0.77343202, "learning_rate": 3.6531250030105917e-06, "loss": 0.79533333, "num_input_tokens_seen": 77034990, "step": 3569, "time_per_iteration": 4.224002122879028 }, { "auxiliary_loss_clip": 0.011511, "auxiliary_loss_mlp": 0.0104435, "balance_loss_clip": 1.05651093, "balance_loss_mlp": 1.02521753, "epoch": 0.21464001202465052, "flos": 18588045957120.0, "grad_norm": 2.6050136504577583, "language_loss": 0.70278227, "learning_rate": 3.6529057651303053e-06, "loss": 0.72473681, "num_input_tokens_seen": 77052610, "step": 3570, "time_per_iteration": 2.668304681777954 }, { "auxiliary_loss_clip": 0.01158856, "auxiliary_loss_mlp": 0.01046783, "balance_loss_clip": 1.05765057, "balance_loss_mlp": 1.02955759, "epoch": 0.21470013527731852, "flos": 21835160461440.0, "grad_norm": 2.5503136440013647, "language_loss": 0.79031628, "learning_rate": 3.6526864645716666e-06, "loss": 0.81237268, "num_input_tokens_seen": 77072475, "step": 3571, "time_per_iteration": 4.066440105438232 }, { "auxiliary_loss_clip": 0.0113831, "auxiliary_loss_mlp": 0.01047146, "balance_loss_clip": 1.05283594, "balance_loss_mlp": 1.02703547, "epoch": 0.21476025852998648, "flos": 17603195880960.0, "grad_norm": 1.9606975528380188, "language_loss": 0.82601345, "learning_rate": 3.652467101342991e-06, "loss": 0.84786803, "num_input_tokens_seen": 77089930, "step": 3572, "time_per_iteration": 2.6096267700195312 }, { "auxiliary_loss_clip": 0.01134964, "auxiliary_loss_mlp": 0.01041355, "balance_loss_clip": 1.05588293, "balance_loss_mlp": 1.02358127, "epoch": 0.21482038178265445, "flos": 24828135264000.0, "grad_norm": 4.1014522432452285, "language_loss": 0.65240026, "learning_rate": 3.652247675452598e-06, "loss": 0.67416352, "num_input_tokens_seen": 77108970, "step": 3573, "time_per_iteration": 2.690986394882202 }, { "auxiliary_loss_clip": 0.01147698, "auxiliary_loss_mlp": 0.01048414, "balance_loss_clip": 1.05253768, "balance_loss_mlp": 1.03140295, "epoch": 0.2148805050353224, "flos": 23258228463360.0, "grad_norm": 2.3397683674355565, "language_loss": 0.75229824, "learning_rate": 3.652028186908807e-06, "loss": 0.77425939, "num_input_tokens_seen": 77126045, "step": 3574, "time_per_iteration": 2.621736526489258 }, { "auxiliary_loss_clip": 0.01138272, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.0526228, "balance_loss_mlp": 1.02414417, "epoch": 0.21494062828799038, "flos": 21321098968320.0, "grad_norm": 1.8157113535402463, "language_loss": 0.72179317, "learning_rate": 3.6518086357199416e-06, "loss": 0.74359143, "num_input_tokens_seen": 77144600, "step": 3575, "time_per_iteration": 4.362869501113892 }, { "auxiliary_loss_clip": 0.01126687, "auxiliary_loss_mlp": 0.01041186, "balance_loss_clip": 1.05261374, "balance_loss_mlp": 1.02422237, "epoch": 0.21500075154065834, "flos": 18843334894080.0, "grad_norm": 3.8402092268612216, "language_loss": 0.68255925, "learning_rate": 3.6515890218943277e-06, "loss": 0.70423794, "num_input_tokens_seen": 77162965, "step": 3576, "time_per_iteration": 2.665370225906372 }, { "auxiliary_loss_clip": 0.01138295, "auxiliary_loss_mlp": 0.01049053, "balance_loss_clip": 1.05064976, "balance_loss_mlp": 1.02859676, "epoch": 0.2150608747933263, "flos": 18441997257600.0, "grad_norm": 2.2101409055401566, "language_loss": 0.88707685, "learning_rate": 3.651369345440292e-06, "loss": 0.90895033, "num_input_tokens_seen": 77179960, "step": 3577, "time_per_iteration": 2.655118465423584 }, { "auxiliary_loss_clip": 0.01070337, "auxiliary_loss_mlp": 0.01022454, "balance_loss_clip": 1.0487709, "balance_loss_mlp": 1.01998615, "epoch": 0.2151209980459943, "flos": 66598242894720.0, "grad_norm": 0.8146982557647512, "language_loss": 0.56184745, "learning_rate": 3.6511496063661654e-06, "loss": 0.58277535, "num_input_tokens_seen": 77239500, "step": 3578, "time_per_iteration": 3.2133536338806152 }, { "auxiliary_loss_clip": 0.01144391, "auxiliary_loss_mlp": 0.00775114, "balance_loss_clip": 1.05492067, "balance_loss_mlp": 1.00130272, "epoch": 0.21518112129866226, "flos": 21575885114880.0, "grad_norm": 2.988933296047806, "language_loss": 0.88686001, "learning_rate": 3.6509298046802807e-06, "loss": 0.90605509, "num_input_tokens_seen": 77254680, "step": 3579, "time_per_iteration": 2.6801605224609375 }, { "auxiliary_loss_clip": 0.01143273, "auxiliary_loss_mlp": 0.0104707, "balance_loss_clip": 1.05253708, "balance_loss_mlp": 1.02945101, "epoch": 0.21524124455133023, "flos": 20047635112320.0, "grad_norm": 1.8556029181899094, "language_loss": 0.77953792, "learning_rate": 3.650709940390972e-06, "loss": 0.80144137, "num_input_tokens_seen": 77274060, "step": 3580, "time_per_iteration": 2.6932644844055176 }, { "auxiliary_loss_clip": 0.01145284, "auxiliary_loss_mlp": 0.01043211, "balance_loss_clip": 1.05702484, "balance_loss_mlp": 1.02543712, "epoch": 0.2153013678039982, "flos": 23951807153280.0, "grad_norm": 1.9843281400180077, "language_loss": 0.72948015, "learning_rate": 3.6504900135065775e-06, "loss": 0.75136507, "num_input_tokens_seen": 77293255, "step": 3581, "time_per_iteration": 2.712376117706299 }, { "auxiliary_loss_clip": 0.01138503, "auxiliary_loss_mlp": 0.0104555, "balance_loss_clip": 1.05348194, "balance_loss_mlp": 1.0269891, "epoch": 0.21536149105666616, "flos": 20594841880320.0, "grad_norm": 2.4257233983700113, "language_loss": 0.70726413, "learning_rate": 3.6502700240354357e-06, "loss": 0.72910464, "num_input_tokens_seen": 77312390, "step": 3582, "time_per_iteration": 2.67122220993042 }, { "auxiliary_loss_clip": 0.01154755, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.05591798, "balance_loss_mlp": 1.0227195, "epoch": 0.21542161430933413, "flos": 12860042895360.0, "grad_norm": 2.4025311229753363, "language_loss": 0.84906816, "learning_rate": 3.650049971985889e-06, "loss": 0.87101901, "num_input_tokens_seen": 77330985, "step": 3583, "time_per_iteration": 2.6395328044891357 }, { "auxiliary_loss_clip": 0.01133287, "auxiliary_loss_mlp": 0.01047024, "balance_loss_clip": 1.05368245, "balance_loss_mlp": 1.02971518, "epoch": 0.21548173756200212, "flos": 26103933504000.0, "grad_norm": 2.7569743809923533, "language_loss": 0.83223897, "learning_rate": 3.6498298573662824e-06, "loss": 0.85404205, "num_input_tokens_seen": 77350770, "step": 3584, "time_per_iteration": 2.730823040008545 }, { "auxiliary_loss_clip": 0.01118851, "auxiliary_loss_mlp": 0.00774813, "balance_loss_clip": 1.0520674, "balance_loss_mlp": 1.00120699, "epoch": 0.21554186081467008, "flos": 22163779013760.0, "grad_norm": 1.9634031706782962, "language_loss": 0.90054697, "learning_rate": 3.6496096801849625e-06, "loss": 0.9194836, "num_input_tokens_seen": 77370510, "step": 3585, "time_per_iteration": 2.722216844558716 }, { "auxiliary_loss_clip": 0.01145179, "auxiliary_loss_mlp": 0.01045359, "balance_loss_clip": 1.05783939, "balance_loss_mlp": 1.02793026, "epoch": 0.21560198406733805, "flos": 22966741595520.0, "grad_norm": 1.9859337557251673, "language_loss": 0.74663597, "learning_rate": 3.649389440450277e-06, "loss": 0.76854134, "num_input_tokens_seen": 77390645, "step": 3586, "time_per_iteration": 2.7681503295898438 }, { "auxiliary_loss_clip": 0.01120328, "auxiliary_loss_mlp": 0.01046334, "balance_loss_clip": 1.05628061, "balance_loss_mlp": 1.03011, "epoch": 0.215662107320006, "flos": 22784064001920.0, "grad_norm": 2.903090853788092, "language_loss": 0.83029532, "learning_rate": 3.6491691381705804e-06, "loss": 0.85196197, "num_input_tokens_seen": 77409655, "step": 3587, "time_per_iteration": 2.788416624069214 }, { "auxiliary_loss_clip": 0.01109364, "auxiliary_loss_mlp": 0.00776304, "balance_loss_clip": 1.05255485, "balance_loss_mlp": 1.00129569, "epoch": 0.21572223057267398, "flos": 30883859038080.0, "grad_norm": 1.7067147212291012, "language_loss": 0.75593436, "learning_rate": 3.648948773354224e-06, "loss": 0.774791, "num_input_tokens_seen": 77430560, "step": 3588, "time_per_iteration": 2.866584062576294 }, { "auxiliary_loss_clip": 0.01136336, "auxiliary_loss_mlp": 0.01039583, "balance_loss_clip": 1.04921389, "balance_loss_mlp": 1.0224762, "epoch": 0.21578235382534194, "flos": 26910487445760.0, "grad_norm": 1.721393113594195, "language_loss": 0.80745661, "learning_rate": 3.6487283460095643e-06, "loss": 0.82921582, "num_input_tokens_seen": 77455000, "step": 3589, "time_per_iteration": 2.8839404582977295 }, { "auxiliary_loss_clip": 0.01157121, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.05677748, "balance_loss_mlp": 1.01992083, "epoch": 0.2158424770780099, "flos": 24425720219520.0, "grad_norm": 2.201221744880259, "language_loss": 0.72849286, "learning_rate": 3.648507856144961e-06, "loss": 0.75042707, "num_input_tokens_seen": 77475075, "step": 3590, "time_per_iteration": 2.6692256927490234 }, { "auxiliary_loss_clip": 0.01134591, "auxiliary_loss_mlp": 0.01044904, "balance_loss_clip": 1.05195427, "balance_loss_mlp": 1.02623618, "epoch": 0.2159026003306779, "flos": 23949975559680.0, "grad_norm": 2.25677544320114, "language_loss": 0.8402462, "learning_rate": 3.648287303768775e-06, "loss": 0.86204112, "num_input_tokens_seen": 77495945, "step": 3591, "time_per_iteration": 2.7531416416168213 }, { "auxiliary_loss_clip": 0.01123784, "auxiliary_loss_mlp": 0.01049552, "balance_loss_clip": 1.05391979, "balance_loss_mlp": 1.02972734, "epoch": 0.21596272358334587, "flos": 30040963511040.0, "grad_norm": 2.2410681113576585, "language_loss": 0.69175243, "learning_rate": 3.6480666888893686e-06, "loss": 0.71348578, "num_input_tokens_seen": 77517140, "step": 3592, "time_per_iteration": 2.8716177940368652 }, { "auxiliary_loss_clip": 0.01117322, "auxiliary_loss_mlp": 0.01050667, "balance_loss_clip": 1.04998767, "balance_loss_mlp": 1.03179634, "epoch": 0.21602284683601383, "flos": 20376217751040.0, "grad_norm": 2.3652325886308123, "language_loss": 0.84022737, "learning_rate": 3.647846011515108e-06, "loss": 0.86190724, "num_input_tokens_seen": 77536085, "step": 3593, "time_per_iteration": 2.7185158729553223 }, { "auxiliary_loss_clip": 0.01123006, "auxiliary_loss_mlp": 0.01048394, "balance_loss_clip": 1.05243289, "balance_loss_mlp": 1.029809, "epoch": 0.2160829700886818, "flos": 20777339905920.0, "grad_norm": 4.017970268493579, "language_loss": 0.75192308, "learning_rate": 3.6476252716543625e-06, "loss": 0.77363706, "num_input_tokens_seen": 77553675, "step": 3594, "time_per_iteration": 2.726027011871338 }, { "auxiliary_loss_clip": 0.01140408, "auxiliary_loss_mlp": 0.01044406, "balance_loss_clip": 1.05318236, "balance_loss_mlp": 1.02650058, "epoch": 0.21614309334134976, "flos": 22309755886080.0, "grad_norm": 1.541030891618627, "language_loss": 0.80459857, "learning_rate": 3.6474044693155007e-06, "loss": 0.82644665, "num_input_tokens_seen": 77573360, "step": 3595, "time_per_iteration": 2.66504168510437 }, { "auxiliary_loss_clip": 0.01119754, "auxiliary_loss_mlp": 0.01039521, "balance_loss_clip": 1.05060601, "balance_loss_mlp": 1.02125788, "epoch": 0.21620321659401773, "flos": 19609524927360.0, "grad_norm": 2.1030283577585007, "language_loss": 0.78930759, "learning_rate": 3.647183604506897e-06, "loss": 0.81090033, "num_input_tokens_seen": 77591865, "step": 3596, "time_per_iteration": 2.7159698009490967 }, { "auxiliary_loss_clip": 0.01080261, "auxiliary_loss_mlp": 0.01047978, "balance_loss_clip": 1.04591155, "balance_loss_mlp": 1.03106225, "epoch": 0.2162633398466857, "flos": 18844555956480.0, "grad_norm": 1.6709210997095376, "language_loss": 0.83061242, "learning_rate": 3.6469626772369253e-06, "loss": 0.85189474, "num_input_tokens_seen": 77611600, "step": 3597, "time_per_iteration": 2.79276704788208 }, { "auxiliary_loss_clip": 0.01133147, "auxiliary_loss_mlp": 0.00775626, "balance_loss_clip": 1.05385637, "balance_loss_mlp": 1.00146937, "epoch": 0.21632346309935369, "flos": 18768820129920.0, "grad_norm": 1.6388312470031852, "language_loss": 0.80549502, "learning_rate": 3.6467416875139642e-06, "loss": 0.8245827, "num_input_tokens_seen": 77630665, "step": 3598, "time_per_iteration": 2.6823580265045166 }, { "auxiliary_loss_clip": 0.01123845, "auxiliary_loss_mlp": 0.01051638, "balance_loss_clip": 1.05069876, "balance_loss_mlp": 1.03218365, "epoch": 0.21638358635202165, "flos": 26324173745280.0, "grad_norm": 1.9066675721358164, "language_loss": 0.82023275, "learning_rate": 3.6465206353463934e-06, "loss": 0.84198749, "num_input_tokens_seen": 77650835, "step": 3599, "time_per_iteration": 2.73583722114563 }, { "auxiliary_loss_clip": 0.0110774, "auxiliary_loss_mlp": 0.00775854, "balance_loss_clip": 1.04651821, "balance_loss_mlp": 1.00131536, "epoch": 0.21644370960468962, "flos": 20740854666240.0, "grad_norm": 2.996184273033617, "language_loss": 0.76724887, "learning_rate": 3.6462995207425947e-06, "loss": 0.78608489, "num_input_tokens_seen": 77669000, "step": 3600, "time_per_iteration": 2.695081949234009 }, { "auxiliary_loss_clip": 0.01112458, "auxiliary_loss_mlp": 0.01044855, "balance_loss_clip": 1.04869664, "balance_loss_mlp": 1.02886891, "epoch": 0.21650383285735758, "flos": 23952238116480.0, "grad_norm": 2.259096111885494, "language_loss": 0.80784452, "learning_rate": 3.6460783437109533e-06, "loss": 0.82941765, "num_input_tokens_seen": 77688745, "step": 3601, "time_per_iteration": 2.8094849586486816 }, { "auxiliary_loss_clip": 0.01155408, "auxiliary_loss_mlp": 0.01046912, "balance_loss_clip": 1.0550983, "balance_loss_mlp": 1.02973413, "epoch": 0.21656395611002555, "flos": 23696087253120.0, "grad_norm": 2.558776342313561, "language_loss": 0.83192647, "learning_rate": 3.6458571042598565e-06, "loss": 0.85394967, "num_input_tokens_seen": 77708445, "step": 3602, "time_per_iteration": 2.652876377105713 }, { "auxiliary_loss_clip": 0.0115161, "auxiliary_loss_mlp": 0.0105032, "balance_loss_clip": 1.0525223, "balance_loss_mlp": 1.03286743, "epoch": 0.2166240793626935, "flos": 20666052593280.0, "grad_norm": 1.768938326380195, "language_loss": 0.7449019, "learning_rate": 3.645635802397693e-06, "loss": 0.76692116, "num_input_tokens_seen": 77728465, "step": 3603, "time_per_iteration": 2.619614601135254 }, { "auxiliary_loss_clip": 0.01116481, "auxiliary_loss_mlp": 0.01047384, "balance_loss_clip": 1.04873598, "balance_loss_mlp": 1.02883554, "epoch": 0.2166842026153615, "flos": 21580410228480.0, "grad_norm": 1.6710689829239502, "language_loss": 0.74178421, "learning_rate": 3.645414438132855e-06, "loss": 0.76342291, "num_input_tokens_seen": 77746735, "step": 3604, "time_per_iteration": 2.730182647705078 }, { "auxiliary_loss_clip": 0.01138214, "auxiliary_loss_mlp": 0.01038079, "balance_loss_clip": 1.05246544, "balance_loss_mlp": 1.02124691, "epoch": 0.21674432586802947, "flos": 25629948610560.0, "grad_norm": 1.7167946204354523, "language_loss": 0.7990489, "learning_rate": 3.6451930114737366e-06, "loss": 0.82081187, "num_input_tokens_seen": 77768105, "step": 3605, "time_per_iteration": 2.67668080329895 }, { "auxiliary_loss_clip": 0.01079717, "auxiliary_loss_mlp": 0.01002026, "balance_loss_clip": 1.0400598, "balance_loss_mlp": 0.99942732, "epoch": 0.21680444912069743, "flos": 56417783616000.0, "grad_norm": 0.7112415560884942, "language_loss": 0.5834192, "learning_rate": 3.6449715224287347e-06, "loss": 0.6042366, "num_input_tokens_seen": 77833750, "step": 3606, "time_per_iteration": 3.2736570835113525 }, { "auxiliary_loss_clip": 0.01155294, "auxiliary_loss_mlp": 0.01043491, "balance_loss_clip": 1.05404341, "balance_loss_mlp": 1.02498984, "epoch": 0.2168645723733654, "flos": 23878944414720.0, "grad_norm": 2.2731951350022275, "language_loss": 0.73142302, "learning_rate": 3.644749971006248e-06, "loss": 0.75341088, "num_input_tokens_seen": 77853780, "step": 3607, "time_per_iteration": 4.267899990081787 }, { "auxiliary_loss_clip": 0.01133762, "auxiliary_loss_mlp": 0.01046639, "balance_loss_clip": 1.05282903, "balance_loss_mlp": 1.02789962, "epoch": 0.21692469562603336, "flos": 16946174257920.0, "grad_norm": 2.181379073292718, "language_loss": 0.76540339, "learning_rate": 3.6445283572146765e-06, "loss": 0.78720737, "num_input_tokens_seen": 77872575, "step": 3608, "time_per_iteration": 4.285630464553833 }, { "auxiliary_loss_clip": 0.01080204, "auxiliary_loss_mlp": 0.01047623, "balance_loss_clip": 1.04536235, "balance_loss_mlp": 1.0309217, "epoch": 0.21698481887870133, "flos": 25119047514240.0, "grad_norm": 2.042587105390135, "language_loss": 0.74584132, "learning_rate": 3.6443066810624255e-06, "loss": 0.76711953, "num_input_tokens_seen": 77892700, "step": 3609, "time_per_iteration": 2.802569627761841 }, { "auxiliary_loss_clip": 0.01131798, "auxiliary_loss_mlp": 0.01049353, "balance_loss_clip": 1.05227149, "balance_loss_mlp": 1.03159094, "epoch": 0.2170449421313693, "flos": 17894682748800.0, "grad_norm": 1.9074832440543417, "language_loss": 0.89132321, "learning_rate": 3.6440849425579e-06, "loss": 0.91313475, "num_input_tokens_seen": 77911060, "step": 3610, "time_per_iteration": 4.189727306365967 }, { "auxiliary_loss_clip": 0.01155294, "auxiliary_loss_mlp": 0.01044238, "balance_loss_clip": 1.05534768, "balance_loss_mlp": 1.02649963, "epoch": 0.2171050653840373, "flos": 22638446265600.0, "grad_norm": 2.058717355808165, "language_loss": 0.77779067, "learning_rate": 3.6438631417095095e-06, "loss": 0.79978603, "num_input_tokens_seen": 77929930, "step": 3611, "time_per_iteration": 2.6317896842956543 }, { "auxiliary_loss_clip": 0.01088447, "auxiliary_loss_mlp": 0.01047447, "balance_loss_clip": 1.04764366, "balance_loss_mlp": 1.03026867, "epoch": 0.21716518863670525, "flos": 19499997381120.0, "grad_norm": 2.3883055198257184, "language_loss": 0.63578451, "learning_rate": 3.6436412785256637e-06, "loss": 0.65714347, "num_input_tokens_seen": 77949060, "step": 3612, "time_per_iteration": 2.8771228790283203 }, { "auxiliary_loss_clip": 0.01091118, "auxiliary_loss_mlp": 0.01053996, "balance_loss_clip": 1.04585218, "balance_loss_mlp": 1.03454065, "epoch": 0.21722531188937322, "flos": 19792022952960.0, "grad_norm": 1.801964584441428, "language_loss": 0.75912857, "learning_rate": 3.643419353014776e-06, "loss": 0.78057969, "num_input_tokens_seen": 77967920, "step": 3613, "time_per_iteration": 2.710601568222046 }, { "auxiliary_loss_clip": 0.0110572, "auxiliary_loss_mlp": 0.01051253, "balance_loss_clip": 1.05008733, "balance_loss_mlp": 1.03121352, "epoch": 0.21728543514204118, "flos": 13334386924800.0, "grad_norm": 1.9293696862218277, "language_loss": 0.71047795, "learning_rate": 3.643197365185261e-06, "loss": 0.73204768, "num_input_tokens_seen": 77985330, "step": 3614, "time_per_iteration": 4.407632112503052 }, { "auxiliary_loss_clip": 0.0114355, "auxiliary_loss_mlp": 0.01048776, "balance_loss_clip": 1.05521107, "balance_loss_mlp": 1.0306083, "epoch": 0.21734555839470915, "flos": 15231870783360.0, "grad_norm": 1.7289280951335333, "language_loss": 0.73030001, "learning_rate": 3.6429753150455378e-06, "loss": 0.75222325, "num_input_tokens_seen": 78003105, "step": 3615, "time_per_iteration": 2.6358401775360107 }, { "auxiliary_loss_clip": 0.01145731, "auxiliary_loss_mlp": 0.01046632, "balance_loss_clip": 1.05206716, "balance_loss_mlp": 1.02703404, "epoch": 0.2174056816473771, "flos": 19973982274560.0, "grad_norm": 2.3648922858816976, "language_loss": 0.90127194, "learning_rate": 3.6427532026040263e-06, "loss": 0.92319548, "num_input_tokens_seen": 78019655, "step": 3616, "time_per_iteration": 2.659787178039551 }, { "auxiliary_loss_clip": 0.01103597, "auxiliary_loss_mlp": 0.01040899, "balance_loss_clip": 1.048136, "balance_loss_mlp": 1.02244496, "epoch": 0.21746580490004508, "flos": 16687293960960.0, "grad_norm": 2.928463545610362, "language_loss": 0.81107831, "learning_rate": 3.642531027869148e-06, "loss": 0.83252329, "num_input_tokens_seen": 78036025, "step": 3617, "time_per_iteration": 2.7723491191864014 }, { "auxiliary_loss_clip": 0.01132531, "auxiliary_loss_mlp": 0.01041286, "balance_loss_clip": 1.05330408, "balance_loss_mlp": 1.02382231, "epoch": 0.21752592815271307, "flos": 25772298209280.0, "grad_norm": 1.9251992817215786, "language_loss": 0.75688154, "learning_rate": 3.642308790849329e-06, "loss": 0.77861977, "num_input_tokens_seen": 78055645, "step": 3618, "time_per_iteration": 2.7608227729797363 }, { "auxiliary_loss_clip": 0.01147874, "auxiliary_loss_mlp": 0.01048647, "balance_loss_clip": 1.05600834, "balance_loss_mlp": 1.03045571, "epoch": 0.21758605140538104, "flos": 11254692349440.0, "grad_norm": 2.18435089101569, "language_loss": 0.69099152, "learning_rate": 3.642086491552996e-06, "loss": 0.71295673, "num_input_tokens_seen": 78071660, "step": 3619, "time_per_iteration": 2.671637773513794 }, { "auxiliary_loss_clip": 0.01144421, "auxiliary_loss_mlp": 0.01042659, "balance_loss_clip": 1.05394137, "balance_loss_mlp": 1.02482569, "epoch": 0.217646174658049, "flos": 19242625455360.0, "grad_norm": 4.829425462001391, "language_loss": 0.78716505, "learning_rate": 3.641864129988579e-06, "loss": 0.8090359, "num_input_tokens_seen": 78091265, "step": 3620, "time_per_iteration": 2.7232043743133545 }, { "auxiliary_loss_clip": 0.01148457, "auxiliary_loss_mlp": 0.01042109, "balance_loss_clip": 1.05161178, "balance_loss_mlp": 1.02507412, "epoch": 0.21770629791071697, "flos": 21945083057280.0, "grad_norm": 1.4663479636678602, "language_loss": 0.79966211, "learning_rate": 3.641641706164509e-06, "loss": 0.82156777, "num_input_tokens_seen": 78110095, "step": 3621, "time_per_iteration": 2.6326823234558105 }, { "auxiliary_loss_clip": 0.01143183, "auxiliary_loss_mlp": 0.01035793, "balance_loss_clip": 1.05334592, "balance_loss_mlp": 1.01955688, "epoch": 0.21776642116338493, "flos": 24936764970240.0, "grad_norm": 1.609721344037994, "language_loss": 0.87796915, "learning_rate": 3.641419220089221e-06, "loss": 0.89975888, "num_input_tokens_seen": 78129475, "step": 3622, "time_per_iteration": 2.6864428520202637 }, { "auxiliary_loss_clip": 0.01146899, "auxiliary_loss_mlp": 0.01037591, "balance_loss_clip": 1.05495822, "balance_loss_mlp": 1.01801729, "epoch": 0.2178265444160529, "flos": 17821317219840.0, "grad_norm": 1.856609178217172, "language_loss": 0.77077621, "learning_rate": 3.641196671771152e-06, "loss": 0.79262108, "num_input_tokens_seen": 78146880, "step": 3623, "time_per_iteration": 2.743601083755493 }, { "auxiliary_loss_clip": 0.01121788, "auxiliary_loss_mlp": 0.01052122, "balance_loss_clip": 1.05279899, "balance_loss_mlp": 1.03226197, "epoch": 0.2178866676687209, "flos": 17712902995200.0, "grad_norm": 2.4362835431673036, "language_loss": 0.84600008, "learning_rate": 3.640974061218741e-06, "loss": 0.86773914, "num_input_tokens_seen": 78165065, "step": 3624, "time_per_iteration": 2.7499353885650635 }, { "auxiliary_loss_clip": 0.01139543, "auxiliary_loss_mlp": 0.01057514, "balance_loss_clip": 1.05353129, "balance_loss_mlp": 1.03804684, "epoch": 0.21794679092138886, "flos": 16945851035520.0, "grad_norm": 2.4333310175924905, "language_loss": 0.78037703, "learning_rate": 3.640751388440429e-06, "loss": 0.80234766, "num_input_tokens_seen": 78180005, "step": 3625, "time_per_iteration": 2.6314821243286133 }, { "auxiliary_loss_clip": 0.01061536, "auxiliary_loss_mlp": 0.01003869, "balance_loss_clip": 1.03318405, "balance_loss_mlp": 1.00130582, "epoch": 0.21800691417405682, "flos": 63718566566400.0, "grad_norm": 0.8242097668179436, "language_loss": 0.60701489, "learning_rate": 3.64052865344466e-06, "loss": 0.62766898, "num_input_tokens_seen": 78245350, "step": 3626, "time_per_iteration": 3.257289409637451 }, { "auxiliary_loss_clip": 0.0112643, "auxiliary_loss_mlp": 0.00776719, "balance_loss_clip": 1.05120194, "balance_loss_mlp": 1.00134754, "epoch": 0.21806703742672479, "flos": 21616392677760.0, "grad_norm": 2.2464694521793094, "language_loss": 0.9077245, "learning_rate": 3.6403058562398795e-06, "loss": 0.92675602, "num_input_tokens_seen": 78264165, "step": 3627, "time_per_iteration": 2.6639885902404785 }, { "auxiliary_loss_clip": 0.0109778, "auxiliary_loss_mlp": 0.01043665, "balance_loss_clip": 1.04912198, "balance_loss_mlp": 1.02471113, "epoch": 0.21812716067939275, "flos": 19354882435200.0, "grad_norm": 1.8437472480823303, "language_loss": 0.73480809, "learning_rate": 3.6400829968345365e-06, "loss": 0.75622261, "num_input_tokens_seen": 78283745, "step": 3628, "time_per_iteration": 2.7430238723754883 }, { "auxiliary_loss_clip": 0.01151444, "auxiliary_loss_mlp": 0.01042108, "balance_loss_clip": 1.05143893, "balance_loss_mlp": 1.02391696, "epoch": 0.21818728393206072, "flos": 23548063305600.0, "grad_norm": 2.8127332529660296, "language_loss": 0.77337319, "learning_rate": 3.6398600752370826e-06, "loss": 0.79530871, "num_input_tokens_seen": 78302900, "step": 3629, "time_per_iteration": 2.6468687057495117 }, { "auxiliary_loss_clip": 0.01142447, "auxiliary_loss_mlp": 0.01044137, "balance_loss_clip": 1.0532223, "balance_loss_mlp": 1.02709055, "epoch": 0.21824740718472868, "flos": 30225652266240.0, "grad_norm": 1.7154004506833416, "language_loss": 0.71373391, "learning_rate": 3.63963709145597e-06, "loss": 0.73559982, "num_input_tokens_seen": 78326470, "step": 3630, "time_per_iteration": 2.7334208488464355 }, { "auxiliary_loss_clip": 0.01089422, "auxiliary_loss_mlp": 0.01040838, "balance_loss_clip": 1.04771948, "balance_loss_mlp": 1.02488792, "epoch": 0.21830753043739667, "flos": 26134672567680.0, "grad_norm": 2.4394061962398625, "language_loss": 0.76502508, "learning_rate": 3.6394140454996544e-06, "loss": 0.78632766, "num_input_tokens_seen": 78345810, "step": 3631, "time_per_iteration": 2.9277098178863525 }, { "auxiliary_loss_clip": 0.01153805, "auxiliary_loss_mlp": 0.01036973, "balance_loss_clip": 1.05322635, "balance_loss_mlp": 1.01950908, "epoch": 0.21836765369006464, "flos": 21720712752000.0, "grad_norm": 3.3333075141454556, "language_loss": 0.75291955, "learning_rate": 3.639190937376594e-06, "loss": 0.77482736, "num_input_tokens_seen": 78364085, "step": 3632, "time_per_iteration": 2.666961908340454 }, { "auxiliary_loss_clip": 0.01149425, "auxiliary_loss_mlp": 0.01038996, "balance_loss_clip": 1.05168736, "balance_loss_mlp": 1.02262831, "epoch": 0.2184277769427326, "flos": 19937604775680.0, "grad_norm": 2.135610011090477, "language_loss": 0.83723396, "learning_rate": 3.638967767095249e-06, "loss": 0.85911822, "num_input_tokens_seen": 78381385, "step": 3633, "time_per_iteration": 2.6193437576293945 }, { "auxiliary_loss_clip": 0.0112373, "auxiliary_loss_mlp": 0.01049933, "balance_loss_clip": 1.05514872, "balance_loss_mlp": 1.03280258, "epoch": 0.21848790019540057, "flos": 20340235301760.0, "grad_norm": 1.713148643324746, "language_loss": 0.81381643, "learning_rate": 3.6387445346640823e-06, "loss": 0.83555305, "num_input_tokens_seen": 78400500, "step": 3634, "time_per_iteration": 2.7383267879486084 }, { "auxiliary_loss_clip": 0.01144832, "auxiliary_loss_mlp": 0.01040423, "balance_loss_clip": 1.0548327, "balance_loss_mlp": 1.02263677, "epoch": 0.21854802344806853, "flos": 15450818135040.0, "grad_norm": 1.8988648345390304, "language_loss": 0.74810624, "learning_rate": 3.638521240091558e-06, "loss": 0.76995879, "num_input_tokens_seen": 78418340, "step": 3635, "time_per_iteration": 2.7461390495300293 }, { "auxiliary_loss_clip": 0.01124703, "auxiliary_loss_mlp": 0.01052922, "balance_loss_clip": 1.05011106, "balance_loss_mlp": 1.03524303, "epoch": 0.2186081467007365, "flos": 16320717711360.0, "grad_norm": 2.2147010555825295, "language_loss": 0.88340998, "learning_rate": 3.6382978833861445e-06, "loss": 0.90518618, "num_input_tokens_seen": 78434375, "step": 3636, "time_per_iteration": 2.631352186203003 }, { "auxiliary_loss_clip": 0.01121776, "auxiliary_loss_mlp": 0.00776363, "balance_loss_clip": 1.05596519, "balance_loss_mlp": 1.00133038, "epoch": 0.2186682699534045, "flos": 21689255416320.0, "grad_norm": 2.464516707854487, "language_loss": 0.76037598, "learning_rate": 3.638074464556311e-06, "loss": 0.77935731, "num_input_tokens_seen": 78451735, "step": 3637, "time_per_iteration": 2.823063373565674 }, { "auxiliary_loss_clip": 0.01137371, "auxiliary_loss_mlp": 0.0104323, "balance_loss_clip": 1.05512452, "balance_loss_mlp": 1.02393031, "epoch": 0.21872839320607246, "flos": 17739260599680.0, "grad_norm": 2.6753688852020328, "language_loss": 0.89996254, "learning_rate": 3.63785098361053e-06, "loss": 0.92176855, "num_input_tokens_seen": 78462730, "step": 3638, "time_per_iteration": 2.6404030323028564 }, { "auxiliary_loss_clip": 0.01142035, "auxiliary_loss_mlp": 0.01051888, "balance_loss_clip": 1.0538702, "balance_loss_mlp": 1.03351748, "epoch": 0.21878851645874042, "flos": 18652289431680.0, "grad_norm": 2.4375531856602692, "language_loss": 0.89243078, "learning_rate": 3.637627440557275e-06, "loss": 0.91436994, "num_input_tokens_seen": 78476300, "step": 3639, "time_per_iteration": 2.6214118003845215 }, { "auxiliary_loss_clip": 0.01134092, "auxiliary_loss_mlp": 0.00776277, "balance_loss_clip": 1.05406988, "balance_loss_mlp": 1.00129211, "epoch": 0.2188486397114084, "flos": 25557301353600.0, "grad_norm": 1.9800691484462982, "language_loss": 0.79167712, "learning_rate": 3.637403835405024e-06, "loss": 0.81078082, "num_input_tokens_seen": 78496135, "step": 3640, "time_per_iteration": 2.7559502124786377 }, { "auxiliary_loss_clip": 0.01149345, "auxiliary_loss_mlp": 0.01055855, "balance_loss_clip": 1.05816483, "balance_loss_mlp": 1.03617346, "epoch": 0.21890876296407635, "flos": 17892061056000.0, "grad_norm": 2.2045237000129942, "language_loss": 0.71708757, "learning_rate": 3.637180168162255e-06, "loss": 0.73913956, "num_input_tokens_seen": 78513855, "step": 3641, "time_per_iteration": 2.6673953533172607 }, { "auxiliary_loss_clip": 0.01130115, "auxiliary_loss_mlp": 0.0104373, "balance_loss_clip": 1.05217481, "balance_loss_mlp": 1.02593243, "epoch": 0.21896888621674432, "flos": 17749100926080.0, "grad_norm": 1.9358190088314053, "language_loss": 0.81427026, "learning_rate": 3.63695643883745e-06, "loss": 0.83600873, "num_input_tokens_seen": 78531740, "step": 3642, "time_per_iteration": 2.6722965240478516 }, { "auxiliary_loss_clip": 0.01150265, "auxiliary_loss_mlp": 0.01044184, "balance_loss_clip": 1.05707705, "balance_loss_mlp": 1.02520561, "epoch": 0.21902900946941228, "flos": 23076161400960.0, "grad_norm": 2.2890480980316865, "language_loss": 0.7124145, "learning_rate": 3.6367326474390928e-06, "loss": 0.73435903, "num_input_tokens_seen": 78549600, "step": 3643, "time_per_iteration": 2.6586625576019287 }, { "auxiliary_loss_clip": 0.01156283, "auxiliary_loss_mlp": 0.01046488, "balance_loss_clip": 1.05430686, "balance_loss_mlp": 1.02728367, "epoch": 0.21908913272208028, "flos": 48178545004800.0, "grad_norm": 2.705040309825256, "language_loss": 0.68497038, "learning_rate": 3.6365087939756696e-06, "loss": 0.70699811, "num_input_tokens_seen": 78573350, "step": 3644, "time_per_iteration": 2.835944414138794 }, { "auxiliary_loss_clip": 0.01157461, "auxiliary_loss_mlp": 0.01049851, "balance_loss_clip": 1.05381823, "balance_loss_mlp": 1.03175521, "epoch": 0.21914925597474824, "flos": 22236749493120.0, "grad_norm": 2.498314523319793, "language_loss": 0.77761143, "learning_rate": 3.636284878455669e-06, "loss": 0.79968452, "num_input_tokens_seen": 78591005, "step": 3645, "time_per_iteration": 2.6053528785705566 }, { "auxiliary_loss_clip": 0.01142456, "auxiliary_loss_mlp": 0.01054431, "balance_loss_clip": 1.05606842, "balance_loss_mlp": 1.03732491, "epoch": 0.2192093792274162, "flos": 22125605834880.0, "grad_norm": 3.1951942186566766, "language_loss": 0.82604313, "learning_rate": 3.636060900887582e-06, "loss": 0.84801197, "num_input_tokens_seen": 78610645, "step": 3646, "time_per_iteration": 4.198619842529297 }, { "auxiliary_loss_clip": 0.01141068, "auxiliary_loss_mlp": 0.01040772, "balance_loss_clip": 1.05287766, "balance_loss_mlp": 1.02365351, "epoch": 0.21926950248008417, "flos": 15669442264320.0, "grad_norm": 1.720246481727725, "language_loss": 0.82877636, "learning_rate": 3.635836861279901e-06, "loss": 0.85059476, "num_input_tokens_seen": 78628340, "step": 3647, "time_per_iteration": 4.229920387268066 }, { "auxiliary_loss_clip": 0.0115057, "auxiliary_loss_mlp": 0.01054202, "balance_loss_clip": 1.05145597, "balance_loss_mlp": 1.03685677, "epoch": 0.21932962573275214, "flos": 30262496641920.0, "grad_norm": 1.6932394069108108, "language_loss": 0.72652817, "learning_rate": 3.635612759641123e-06, "loss": 0.74857587, "num_input_tokens_seen": 78649355, "step": 3648, "time_per_iteration": 2.7226104736328125 }, { "auxiliary_loss_clip": 0.01110484, "auxiliary_loss_mlp": 0.01057841, "balance_loss_clip": 1.04757857, "balance_loss_mlp": 1.03643107, "epoch": 0.2193897489854201, "flos": 10780132838400.0, "grad_norm": 3.9115777702699175, "language_loss": 0.74917972, "learning_rate": 3.635388595979745e-06, "loss": 0.77086294, "num_input_tokens_seen": 78664915, "step": 3649, "time_per_iteration": 4.201031446456909 }, { "auxiliary_loss_clip": 0.01138726, "auxiliary_loss_mlp": 0.0105421, "balance_loss_clip": 1.0536499, "balance_loss_mlp": 1.03718746, "epoch": 0.21944987223808807, "flos": 19133313390720.0, "grad_norm": 1.8914434058388716, "language_loss": 0.86353791, "learning_rate": 3.635164370304267e-06, "loss": 0.88546729, "num_input_tokens_seen": 78681475, "step": 3650, "time_per_iteration": 2.6061322689056396 }, { "auxiliary_loss_clip": 0.01130852, "auxiliary_loss_mlp": 0.01052398, "balance_loss_clip": 1.04992914, "balance_loss_mlp": 1.03439701, "epoch": 0.21950999549075606, "flos": 22711093522560.0, "grad_norm": 2.798139483493165, "language_loss": 0.83541161, "learning_rate": 3.6349400826231927e-06, "loss": 0.85724407, "num_input_tokens_seen": 78702300, "step": 3651, "time_per_iteration": 2.7605133056640625 }, { "auxiliary_loss_clip": 0.01143643, "auxiliary_loss_mlp": 0.0105251, "balance_loss_clip": 1.05282581, "balance_loss_mlp": 1.03511763, "epoch": 0.21957011874342403, "flos": 10561329141120.0, "grad_norm": 1.9065881796375543, "language_loss": 0.74475014, "learning_rate": 3.634715732945027e-06, "loss": 0.76671165, "num_input_tokens_seen": 78720230, "step": 3652, "time_per_iteration": 2.597443103790283 }, { "auxiliary_loss_clip": 0.01038431, "auxiliary_loss_mlp": 0.01009267, "balance_loss_clip": 1.0361495, "balance_loss_mlp": 1.0068711, "epoch": 0.219630241996092, "flos": 65747913252480.0, "grad_norm": 0.7482502800744824, "language_loss": 0.51550615, "learning_rate": 3.6344913212782764e-06, "loss": 0.5359832, "num_input_tokens_seen": 78780200, "step": 3653, "time_per_iteration": 3.324497699737549 }, { "auxiliary_loss_clip": 0.01125533, "auxiliary_loss_mlp": 0.01062527, "balance_loss_clip": 1.05436754, "balance_loss_mlp": 1.04470527, "epoch": 0.21969036524875996, "flos": 23696518216320.0, "grad_norm": 1.9578946934595152, "language_loss": 0.75356162, "learning_rate": 3.6342668476314514e-06, "loss": 0.77544224, "num_input_tokens_seen": 78800575, "step": 3654, "time_per_iteration": 4.296064615249634 }, { "auxiliary_loss_clip": 0.01152337, "auxiliary_loss_mlp": 0.01051249, "balance_loss_clip": 1.05944824, "balance_loss_mlp": 1.03376114, "epoch": 0.21975048850142792, "flos": 19640910435840.0, "grad_norm": 1.8387519277823352, "language_loss": 0.72646022, "learning_rate": 3.634042312013064e-06, "loss": 0.74849606, "num_input_tokens_seen": 78819585, "step": 3655, "time_per_iteration": 2.6634860038757324 }, { "auxiliary_loss_clip": 0.01130021, "auxiliary_loss_mlp": 0.01048784, "balance_loss_clip": 1.05423379, "balance_loss_mlp": 1.03071189, "epoch": 0.21981061175409589, "flos": 22448550038400.0, "grad_norm": 1.722985511504472, "language_loss": 0.80795759, "learning_rate": 3.6338177144316276e-06, "loss": 0.82974565, "num_input_tokens_seen": 78837330, "step": 3656, "time_per_iteration": 2.730391502380371 }, { "auxiliary_loss_clip": 0.01124773, "auxiliary_loss_mlp": 0.00776202, "balance_loss_clip": 1.06113994, "balance_loss_mlp": 1.00139225, "epoch": 0.21987073500676388, "flos": 18151049093760.0, "grad_norm": 2.646453773467974, "language_loss": 0.84885842, "learning_rate": 3.63359305489566e-06, "loss": 0.86786819, "num_input_tokens_seen": 78854955, "step": 3657, "time_per_iteration": 2.657607078552246 }, { "auxiliary_loss_clip": 0.01142645, "auxiliary_loss_mlp": 0.01040533, "balance_loss_clip": 1.05631852, "balance_loss_mlp": 1.02260423, "epoch": 0.21993085825943184, "flos": 25626177682560.0, "grad_norm": 2.6990832263195585, "language_loss": 0.80355585, "learning_rate": 3.6333683334136803e-06, "loss": 0.82538766, "num_input_tokens_seen": 78874965, "step": 3658, "time_per_iteration": 2.6584107875823975 }, { "auxiliary_loss_clip": 0.01048937, "auxiliary_loss_mlp": 0.0100499, "balance_loss_clip": 1.03857517, "balance_loss_mlp": 1.00202215, "epoch": 0.2199909815120998, "flos": 70923217743360.0, "grad_norm": 0.7788612160796681, "language_loss": 0.58191586, "learning_rate": 3.6331435499942095e-06, "loss": 0.60245514, "num_input_tokens_seen": 78937740, "step": 3659, "time_per_iteration": 3.3395371437072754 }, { "auxiliary_loss_clip": 0.01111007, "auxiliary_loss_mlp": 0.0105329, "balance_loss_clip": 1.05029392, "balance_loss_mlp": 1.03471744, "epoch": 0.22005110476476777, "flos": 21543529939200.0, "grad_norm": 4.382741616753977, "language_loss": 0.7477597, "learning_rate": 3.632918704645772e-06, "loss": 0.76940262, "num_input_tokens_seen": 78955055, "step": 3660, "time_per_iteration": 2.782975435256958 }, { "auxiliary_loss_clip": 0.01147277, "auxiliary_loss_mlp": 0.01044652, "balance_loss_clip": 1.05691171, "balance_loss_mlp": 1.02653265, "epoch": 0.22011122801743574, "flos": 22054502862720.0, "grad_norm": 1.8856077512582532, "language_loss": 0.81484449, "learning_rate": 3.632693797376893e-06, "loss": 0.83676374, "num_input_tokens_seen": 78974895, "step": 3661, "time_per_iteration": 2.7780110836029053 }, { "auxiliary_loss_clip": 0.01126694, "auxiliary_loss_mlp": 0.01056397, "balance_loss_clip": 1.05167532, "balance_loss_mlp": 1.03800273, "epoch": 0.2201713512701037, "flos": 26687589598080.0, "grad_norm": 1.9746283079458686, "language_loss": 0.73154199, "learning_rate": 3.632468828196102e-06, "loss": 0.75337297, "num_input_tokens_seen": 78994990, "step": 3662, "time_per_iteration": 2.7189040184020996 }, { "auxiliary_loss_clip": 0.0113519, "auxiliary_loss_mlp": 0.01051686, "balance_loss_clip": 1.05718994, "balance_loss_mlp": 1.03555691, "epoch": 0.22023147452277167, "flos": 22162198815360.0, "grad_norm": 2.0576168655035714, "language_loss": 0.78066969, "learning_rate": 3.632243797111929e-06, "loss": 0.80253839, "num_input_tokens_seen": 79014405, "step": 3663, "time_per_iteration": 2.731412410736084 }, { "auxiliary_loss_clip": 0.01142837, "auxiliary_loss_mlp": 0.01063521, "balance_loss_clip": 1.05659413, "balance_loss_mlp": 1.04352939, "epoch": 0.22029159777543966, "flos": 22523280284160.0, "grad_norm": 1.752119258875799, "language_loss": 0.80294079, "learning_rate": 3.632018704132908e-06, "loss": 0.82500434, "num_input_tokens_seen": 79032375, "step": 3664, "time_per_iteration": 2.7043297290802 }, { "auxiliary_loss_clip": 0.01134207, "auxiliary_loss_mlp": 0.01044352, "balance_loss_clip": 1.05424213, "balance_loss_mlp": 1.02474177, "epoch": 0.22035172102810763, "flos": 13042469093760.0, "grad_norm": 3.138103913885462, "language_loss": 0.76388288, "learning_rate": 3.6317935492675742e-06, "loss": 0.78566849, "num_input_tokens_seen": 79049635, "step": 3665, "time_per_iteration": 2.68300199508667 }, { "auxiliary_loss_clip": 0.01128405, "auxiliary_loss_mlp": 0.01053304, "balance_loss_clip": 1.05599689, "balance_loss_mlp": 1.03589976, "epoch": 0.2204118442807756, "flos": 12165817760640.0, "grad_norm": 2.9738702224471583, "language_loss": 0.9800086, "learning_rate": 3.631568332524466e-06, "loss": 1.00182581, "num_input_tokens_seen": 79062890, "step": 3666, "time_per_iteration": 2.702584981918335 }, { "auxiliary_loss_clip": 0.01141573, "auxiliary_loss_mlp": 0.00776689, "balance_loss_clip": 1.05254698, "balance_loss_mlp": 1.00133562, "epoch": 0.22047196753344356, "flos": 40108806673920.0, "grad_norm": 1.894759892223008, "language_loss": 0.80946934, "learning_rate": 3.631343053912122e-06, "loss": 0.82865196, "num_input_tokens_seen": 79085495, "step": 3667, "time_per_iteration": 2.8920814990997314 }, { "auxiliary_loss_clip": 0.01149896, "auxiliary_loss_mlp": 0.01051036, "balance_loss_clip": 1.06145239, "balance_loss_mlp": 1.03161693, "epoch": 0.22053209078611152, "flos": 20701137202560.0, "grad_norm": 1.8771463594277091, "language_loss": 0.7736783, "learning_rate": 3.631117713439087e-06, "loss": 0.79568756, "num_input_tokens_seen": 79101820, "step": 3668, "time_per_iteration": 2.6733500957489014 }, { "auxiliary_loss_clip": 0.01143618, "auxiliary_loss_mlp": 0.01047462, "balance_loss_clip": 1.05955744, "balance_loss_mlp": 1.02972412, "epoch": 0.2205922140387795, "flos": 24716309247360.0, "grad_norm": 1.7809066581326154, "language_loss": 0.71624571, "learning_rate": 3.630892311113904e-06, "loss": 0.7381565, "num_input_tokens_seen": 79123320, "step": 3669, "time_per_iteration": 2.7298974990844727 }, { "auxiliary_loss_clip": 0.01155448, "auxiliary_loss_mlp": 0.01039044, "balance_loss_clip": 1.0544126, "balance_loss_mlp": 1.0217346, "epoch": 0.22065233729144745, "flos": 23477247642240.0, "grad_norm": 2.1257290130035082, "language_loss": 0.85160267, "learning_rate": 3.6306668469451215e-06, "loss": 0.87354761, "num_input_tokens_seen": 79141615, "step": 3670, "time_per_iteration": 2.6624948978424072 }, { "auxiliary_loss_clip": 0.01137906, "auxiliary_loss_mlp": 0.01042298, "balance_loss_clip": 1.05475712, "balance_loss_mlp": 1.02376091, "epoch": 0.22071246054411545, "flos": 35225566646400.0, "grad_norm": 1.8008957470192373, "language_loss": 0.76928926, "learning_rate": 3.6304413209412886e-06, "loss": 0.79109132, "num_input_tokens_seen": 79164910, "step": 3671, "time_per_iteration": 2.7914648056030273 }, { "auxiliary_loss_clip": 0.01126159, "auxiliary_loss_mlp": 0.01040764, "balance_loss_clip": 1.05423856, "balance_loss_mlp": 1.02281129, "epoch": 0.2207725837967834, "flos": 18150294908160.0, "grad_norm": 2.015071454696955, "language_loss": 0.80643147, "learning_rate": 3.6302157331109573e-06, "loss": 0.82810068, "num_input_tokens_seen": 79179685, "step": 3672, "time_per_iteration": 2.674381732940674 }, { "auxiliary_loss_clip": 0.01149005, "auxiliary_loss_mlp": 0.01047239, "balance_loss_clip": 1.05706501, "balance_loss_mlp": 1.02992952, "epoch": 0.22083270704945138, "flos": 20479675898880.0, "grad_norm": 2.222038104071356, "language_loss": 0.73278964, "learning_rate": 3.629990083462682e-06, "loss": 0.75475204, "num_input_tokens_seen": 79196285, "step": 3673, "time_per_iteration": 2.6856846809387207 }, { "auxiliary_loss_clip": 0.01121745, "auxiliary_loss_mlp": 0.01044908, "balance_loss_clip": 1.05473876, "balance_loss_mlp": 1.02608538, "epoch": 0.22089283030211934, "flos": 34125801984000.0, "grad_norm": 1.9530426336903413, "language_loss": 0.76384282, "learning_rate": 3.6297643720050203e-06, "loss": 0.78550935, "num_input_tokens_seen": 79216060, "step": 3674, "time_per_iteration": 2.816190242767334 }, { "auxiliary_loss_clip": 0.01156134, "auxiliary_loss_mlp": 0.01047969, "balance_loss_clip": 1.05650616, "balance_loss_mlp": 1.02850175, "epoch": 0.2209529535547873, "flos": 18077216688000.0, "grad_norm": 2.045565300481816, "language_loss": 0.74367136, "learning_rate": 3.6295385987465293e-06, "loss": 0.76571238, "num_input_tokens_seen": 79235145, "step": 3675, "time_per_iteration": 2.69748592376709 }, { "auxiliary_loss_clip": 0.01155113, "auxiliary_loss_mlp": 0.01045626, "balance_loss_clip": 1.05442023, "balance_loss_mlp": 1.02800727, "epoch": 0.22101307680745527, "flos": 27235335070080.0, "grad_norm": 1.898816078558846, "language_loss": 0.79801333, "learning_rate": 3.629312763695772e-06, "loss": 0.82002068, "num_input_tokens_seen": 79256960, "step": 3676, "time_per_iteration": 2.6792948246002197 }, { "auxiliary_loss_clip": 0.01133095, "auxiliary_loss_mlp": 0.01049823, "balance_loss_clip": 1.05366707, "balance_loss_mlp": 1.03257358, "epoch": 0.22107320006012326, "flos": 16543256423040.0, "grad_norm": 2.1537198076644954, "language_loss": 0.75327688, "learning_rate": 3.6290868668613107e-06, "loss": 0.77510607, "num_input_tokens_seen": 79274860, "step": 3677, "time_per_iteration": 2.781393527984619 }, { "auxiliary_loss_clip": 0.0111612, "auxiliary_loss_mlp": 0.01050059, "balance_loss_clip": 1.04986429, "balance_loss_mlp": 1.03212988, "epoch": 0.22113332331279123, "flos": 22054466949120.0, "grad_norm": 1.7875463894855461, "language_loss": 0.83287871, "learning_rate": 3.628860908251712e-06, "loss": 0.85454059, "num_input_tokens_seen": 79294005, "step": 3678, "time_per_iteration": 2.752838611602783 }, { "auxiliary_loss_clip": 0.01094052, "auxiliary_loss_mlp": 0.01058605, "balance_loss_clip": 1.04951406, "balance_loss_mlp": 1.03992522, "epoch": 0.2211934465654592, "flos": 26612787525120.0, "grad_norm": 1.6742153249136704, "language_loss": 0.89135075, "learning_rate": 3.6286348878755452e-06, "loss": 0.91287732, "num_input_tokens_seen": 79314005, "step": 3679, "time_per_iteration": 2.8282527923583984 }, { "auxiliary_loss_clip": 0.01147641, "auxiliary_loss_mlp": 0.01054276, "balance_loss_clip": 1.05507338, "balance_loss_mlp": 1.03615618, "epoch": 0.22125356981812716, "flos": 16360363347840.0, "grad_norm": 3.092644946410345, "language_loss": 0.8649044, "learning_rate": 3.6284088057413803e-06, "loss": 0.88692355, "num_input_tokens_seen": 79331030, "step": 3680, "time_per_iteration": 2.630829095840454 }, { "auxiliary_loss_clip": 0.0111249, "auxiliary_loss_mlp": 0.01052062, "balance_loss_clip": 1.05374503, "balance_loss_mlp": 1.03395414, "epoch": 0.22131369307079513, "flos": 21651118151040.0, "grad_norm": 1.9427224492838853, "language_loss": 0.81773758, "learning_rate": 3.6281826618577894e-06, "loss": 0.83938313, "num_input_tokens_seen": 79348560, "step": 3681, "time_per_iteration": 2.805880069732666 }, { "auxiliary_loss_clip": 0.01148508, "auxiliary_loss_mlp": 0.00775652, "balance_loss_clip": 1.0530386, "balance_loss_mlp": 1.00146043, "epoch": 0.2213738163234631, "flos": 19609524927360.0, "grad_norm": 2.296553230959153, "language_loss": 0.80099678, "learning_rate": 3.62795645623335e-06, "loss": 0.82023835, "num_input_tokens_seen": 79367175, "step": 3682, "time_per_iteration": 2.624234199523926 }, { "auxiliary_loss_clip": 0.0112405, "auxiliary_loss_mlp": 0.0105126, "balance_loss_clip": 1.0500052, "balance_loss_mlp": 1.03198409, "epoch": 0.22143393957613106, "flos": 23623404082560.0, "grad_norm": 1.6781760642146926, "language_loss": 0.77394038, "learning_rate": 3.627730188876638e-06, "loss": 0.7956934, "num_input_tokens_seen": 79388435, "step": 3683, "time_per_iteration": 2.6746323108673096 }, { "auxiliary_loss_clip": 0.01129753, "auxiliary_loss_mlp": 0.01051291, "balance_loss_clip": 1.05048668, "balance_loss_mlp": 1.03411245, "epoch": 0.22149406282879905, "flos": 26177801823360.0, "grad_norm": 2.1201256685163323, "language_loss": 0.72406399, "learning_rate": 3.627503859796234e-06, "loss": 0.7458744, "num_input_tokens_seen": 79407910, "step": 3684, "time_per_iteration": 2.695958375930786 }, { "auxiliary_loss_clip": 0.01084051, "auxiliary_loss_mlp": 0.01045612, "balance_loss_clip": 1.04670835, "balance_loss_mlp": 1.02571654, "epoch": 0.221554186081467, "flos": 14538758970240.0, "grad_norm": 2.1308896442870893, "language_loss": 0.79817796, "learning_rate": 3.6272774690007207e-06, "loss": 0.81947458, "num_input_tokens_seen": 79424020, "step": 3685, "time_per_iteration": 2.7443795204162598 }, { "auxiliary_loss_clip": 0.01147394, "auxiliary_loss_mlp": 0.01045457, "balance_loss_clip": 1.05201805, "balance_loss_mlp": 1.02867222, "epoch": 0.22161430933413498, "flos": 22238257864320.0, "grad_norm": 1.6870532517893482, "language_loss": 0.87305272, "learning_rate": 3.6270510164986823e-06, "loss": 0.89498115, "num_input_tokens_seen": 79445605, "step": 3686, "time_per_iteration": 4.388494968414307 }, { "auxiliary_loss_clip": 0.01137917, "auxiliary_loss_mlp": 0.0104367, "balance_loss_clip": 1.052562, "balance_loss_mlp": 1.02620554, "epoch": 0.22167443258680294, "flos": 23476529370240.0, "grad_norm": 1.8821221420403713, "language_loss": 0.78069639, "learning_rate": 3.626824502298707e-06, "loss": 0.80251229, "num_input_tokens_seen": 79463850, "step": 3687, "time_per_iteration": 4.123531103134155 }, { "auxiliary_loss_clip": 0.0112545, "auxiliary_loss_mlp": 0.01052599, "balance_loss_clip": 1.0494144, "balance_loss_mlp": 1.0331558, "epoch": 0.2217345558394709, "flos": 23221132692480.0, "grad_norm": 1.8251811803295879, "language_loss": 0.84860861, "learning_rate": 3.626597926409383e-06, "loss": 0.8703891, "num_input_tokens_seen": 79482845, "step": 3688, "time_per_iteration": 4.287938594818115 }, { "auxiliary_loss_clip": 0.01110764, "auxiliary_loss_mlp": 0.01051634, "balance_loss_clip": 1.04967332, "balance_loss_mlp": 1.03254843, "epoch": 0.22179467909213887, "flos": 20011078045440.0, "grad_norm": 1.7785994747216247, "language_loss": 0.81150943, "learning_rate": 3.6263712888393027e-06, "loss": 0.83313334, "num_input_tokens_seen": 79501550, "step": 3689, "time_per_iteration": 2.7521302700042725 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.01048971, "balance_loss_clip": 1.05078936, "balance_loss_mlp": 1.03131568, "epoch": 0.22185480234480687, "flos": 19683034110720.0, "grad_norm": 1.7481542974535997, "language_loss": 0.70018351, "learning_rate": 3.626144589597061e-06, "loss": 0.72191954, "num_input_tokens_seen": 79519680, "step": 3690, "time_per_iteration": 2.6664223670959473 }, { "auxiliary_loss_clip": 0.01147193, "auxiliary_loss_mlp": 0.00777365, "balance_loss_clip": 1.0537169, "balance_loss_mlp": 1.00153625, "epoch": 0.22191492559747483, "flos": 21981316901760.0, "grad_norm": 1.8112729447523994, "language_loss": 0.72609359, "learning_rate": 3.6259178286912528e-06, "loss": 0.74533916, "num_input_tokens_seen": 79539000, "step": 3691, "time_per_iteration": 2.6724495887756348 }, { "auxiliary_loss_clip": 0.01144688, "auxiliary_loss_mlp": 0.01046427, "balance_loss_clip": 1.05663919, "balance_loss_mlp": 1.0275923, "epoch": 0.2219750488501428, "flos": 23222066446080.0, "grad_norm": 1.8134603978799304, "language_loss": 0.71503472, "learning_rate": 3.625691006130477e-06, "loss": 0.73694593, "num_input_tokens_seen": 79559695, "step": 3692, "time_per_iteration": 2.6743686199188232 }, { "auxiliary_loss_clip": 0.01147828, "auxiliary_loss_mlp": 0.01048973, "balance_loss_clip": 1.05410266, "balance_loss_mlp": 1.03098464, "epoch": 0.22203517210281076, "flos": 22453685683200.0, "grad_norm": 2.1147705582229577, "language_loss": 0.87551594, "learning_rate": 3.6254641219233362e-06, "loss": 0.89748394, "num_input_tokens_seen": 79579095, "step": 3693, "time_per_iteration": 4.2962939739227295 }, { "auxiliary_loss_clip": 0.01141134, "auxiliary_loss_mlp": 0.01041066, "balance_loss_clip": 1.0537045, "balance_loss_mlp": 1.02479386, "epoch": 0.22209529535547873, "flos": 17564555825280.0, "grad_norm": 1.9865017520636683, "language_loss": 0.85553116, "learning_rate": 3.6252371760784325e-06, "loss": 0.87735319, "num_input_tokens_seen": 79596430, "step": 3694, "time_per_iteration": 2.585657835006714 }, { "auxiliary_loss_clip": 0.01107468, "auxiliary_loss_mlp": 0.01045482, "balance_loss_clip": 1.04370403, "balance_loss_mlp": 1.02640843, "epoch": 0.2221554186081467, "flos": 21469015175040.0, "grad_norm": 2.1752375595399136, "language_loss": 0.68740189, "learning_rate": 3.6250101686043725e-06, "loss": 0.70893133, "num_input_tokens_seen": 79615825, "step": 3695, "time_per_iteration": 2.744264841079712 }, { "auxiliary_loss_clip": 0.01118075, "auxiliary_loss_mlp": 0.01047291, "balance_loss_clip": 1.051736, "balance_loss_mlp": 1.0310905, "epoch": 0.22221554186081466, "flos": 27673445255040.0, "grad_norm": 1.6851408018575031, "language_loss": 0.71540272, "learning_rate": 3.6247830995097637e-06, "loss": 0.73705637, "num_input_tokens_seen": 79637875, "step": 3696, "time_per_iteration": 2.7320780754089355 }, { "auxiliary_loss_clip": 0.01140935, "auxiliary_loss_mlp": 0.0104304, "balance_loss_clip": 1.05123305, "balance_loss_mlp": 1.02455115, "epoch": 0.22227566511348265, "flos": 25958926298880.0, "grad_norm": 1.7186386141421306, "language_loss": 0.87905443, "learning_rate": 3.624555968803217e-06, "loss": 0.90089417, "num_input_tokens_seen": 79656970, "step": 3697, "time_per_iteration": 2.65919828414917 }, { "auxiliary_loss_clip": 0.01118987, "auxiliary_loss_mlp": 0.0104214, "balance_loss_clip": 1.04718316, "balance_loss_mlp": 1.0255338, "epoch": 0.22233578836615062, "flos": 39203678833920.0, "grad_norm": 1.6515031384229777, "language_loss": 0.65900242, "learning_rate": 3.624328776493346e-06, "loss": 0.6806137, "num_input_tokens_seen": 79680275, "step": 3698, "time_per_iteration": 2.7708024978637695 }, { "auxiliary_loss_clip": 0.01142696, "auxiliary_loss_mlp": 0.01049333, "balance_loss_clip": 1.05630088, "balance_loss_mlp": 1.03102303, "epoch": 0.22239591161881858, "flos": 36283782251520.0, "grad_norm": 1.9634592665257078, "language_loss": 0.82520199, "learning_rate": 3.6241015225887637e-06, "loss": 0.84712231, "num_input_tokens_seen": 79701255, "step": 3699, "time_per_iteration": 2.7743008136749268 }, { "auxiliary_loss_clip": 0.01129692, "auxiliary_loss_mlp": 0.01047594, "balance_loss_clip": 1.05154991, "balance_loss_mlp": 1.02939105, "epoch": 0.22245603487148655, "flos": 19719591177600.0, "grad_norm": 1.6711069078421557, "language_loss": 0.79384553, "learning_rate": 3.62387420709809e-06, "loss": 0.8156184, "num_input_tokens_seen": 79721315, "step": 3700, "time_per_iteration": 2.652172327041626 }, { "auxiliary_loss_clip": 0.01111144, "auxiliary_loss_mlp": 0.01045464, "balance_loss_clip": 1.04893112, "balance_loss_mlp": 1.02608061, "epoch": 0.2225161581241545, "flos": 46280450615040.0, "grad_norm": 2.123831341506728, "language_loss": 0.72503817, "learning_rate": 3.623646830029943e-06, "loss": 0.74660432, "num_input_tokens_seen": 79742705, "step": 3701, "time_per_iteration": 2.943124294281006 }, { "auxiliary_loss_clip": 0.01139412, "auxiliary_loss_mlp": 0.0104206, "balance_loss_clip": 1.05053067, "balance_loss_mlp": 1.0246197, "epoch": 0.22257628137682248, "flos": 23696194993920.0, "grad_norm": 1.9127522113256972, "language_loss": 0.79901838, "learning_rate": 3.6234193913929454e-06, "loss": 0.82083315, "num_input_tokens_seen": 79763000, "step": 3702, "time_per_iteration": 2.6978282928466797 }, { "auxiliary_loss_clip": 0.01129024, "auxiliary_loss_mlp": 0.01044082, "balance_loss_clip": 1.04707038, "balance_loss_mlp": 1.02655816, "epoch": 0.22263640462949044, "flos": 19353984595200.0, "grad_norm": 1.8258996761992496, "language_loss": 0.78237271, "learning_rate": 3.623191891195723e-06, "loss": 0.80410373, "num_input_tokens_seen": 79781335, "step": 3703, "time_per_iteration": 2.6528990268707275 }, { "auxiliary_loss_clip": 0.01140219, "auxiliary_loss_mlp": 0.01036919, "balance_loss_clip": 1.0503273, "balance_loss_mlp": 1.0171181, "epoch": 0.22269652788215843, "flos": 20776047016320.0, "grad_norm": 2.1693263198920563, "language_loss": 0.74490714, "learning_rate": 3.6229643294469005e-06, "loss": 0.76667851, "num_input_tokens_seen": 79800150, "step": 3704, "time_per_iteration": 2.679184913635254 }, { "auxiliary_loss_clip": 0.0110341, "auxiliary_loss_mlp": 0.01043861, "balance_loss_clip": 1.046996, "balance_loss_mlp": 1.02684951, "epoch": 0.2227566511348264, "flos": 47958843467520.0, "grad_norm": 1.8279463297536431, "language_loss": 0.644319, "learning_rate": 3.6227367061551074e-06, "loss": 0.66579175, "num_input_tokens_seen": 79822390, "step": 3705, "time_per_iteration": 2.972221612930298 }, { "auxiliary_loss_clip": 0.01037239, "auxiliary_loss_mlp": 0.01023153, "balance_loss_clip": 1.03748369, "balance_loss_mlp": 1.02111423, "epoch": 0.22281677438749437, "flos": 66218953230720.0, "grad_norm": 1.2472387125776994, "language_loss": 0.65169704, "learning_rate": 3.6225090213289766e-06, "loss": 0.67230093, "num_input_tokens_seen": 79873350, "step": 3706, "time_per_iteration": 3.118619203567505 }, { "auxiliary_loss_clip": 0.01116185, "auxiliary_loss_mlp": 0.01040401, "balance_loss_clip": 1.04938805, "balance_loss_mlp": 1.02290082, "epoch": 0.22287689764016233, "flos": 21871609787520.0, "grad_norm": 1.912279921070755, "language_loss": 0.80597419, "learning_rate": 3.622281274977141e-06, "loss": 0.8275401, "num_input_tokens_seen": 79891715, "step": 3707, "time_per_iteration": 2.6555368900299072 }, { "auxiliary_loss_clip": 0.01149897, "auxiliary_loss_mlp": 0.01039316, "balance_loss_clip": 1.05199265, "balance_loss_mlp": 1.02203059, "epoch": 0.2229370208928303, "flos": 27672475587840.0, "grad_norm": 1.9339558574691282, "language_loss": 0.78542316, "learning_rate": 3.6220534671082367e-06, "loss": 0.80731529, "num_input_tokens_seen": 79911175, "step": 3708, "time_per_iteration": 2.7179131507873535 }, { "auxiliary_loss_clip": 0.01128276, "auxiliary_loss_mlp": 0.01042525, "balance_loss_clip": 1.05055118, "balance_loss_mlp": 1.02363038, "epoch": 0.22299714414549826, "flos": 30154657034880.0, "grad_norm": 1.8085596793383067, "language_loss": 0.80606776, "learning_rate": 3.6218255977309024e-06, "loss": 0.82777578, "num_input_tokens_seen": 79931875, "step": 3709, "time_per_iteration": 2.810605764389038 }, { "auxiliary_loss_clip": 0.01135044, "auxiliary_loss_mlp": 0.00777248, "balance_loss_clip": 1.0480969, "balance_loss_mlp": 1.0014261, "epoch": 0.22305726739816625, "flos": 23143134309120.0, "grad_norm": 2.100780376064183, "language_loss": 0.69068789, "learning_rate": 3.6215976668537787e-06, "loss": 0.70981085, "num_input_tokens_seen": 79952445, "step": 3710, "time_per_iteration": 2.7197980880737305 }, { "auxiliary_loss_clip": 0.01111671, "auxiliary_loss_mlp": 0.01050475, "balance_loss_clip": 1.04630041, "balance_loss_mlp": 1.03220057, "epoch": 0.22311739065083422, "flos": 19172061187200.0, "grad_norm": 2.1025491711486763, "language_loss": 0.90782154, "learning_rate": 3.6213696744855096e-06, "loss": 0.92944294, "num_input_tokens_seen": 79971030, "step": 3711, "time_per_iteration": 2.808014154434204 }, { "auxiliary_loss_clip": 0.01117969, "auxiliary_loss_mlp": 0.01059175, "balance_loss_clip": 1.04696095, "balance_loss_mlp": 1.03921938, "epoch": 0.22317751390350218, "flos": 13617757319040.0, "grad_norm": 6.2447945102939615, "language_loss": 0.89070308, "learning_rate": 3.6211416206347395e-06, "loss": 0.91247451, "num_input_tokens_seen": 79982085, "step": 3712, "time_per_iteration": 2.6701955795288086 }, { "auxiliary_loss_clip": 0.01150852, "auxiliary_loss_mlp": 0.01044271, "balance_loss_clip": 1.05445373, "balance_loss_mlp": 1.02627039, "epoch": 0.22323763715617015, "flos": 11029065068160.0, "grad_norm": 5.249819485386642, "language_loss": 0.75858659, "learning_rate": 3.620913505310117e-06, "loss": 0.78053784, "num_input_tokens_seen": 79997460, "step": 3713, "time_per_iteration": 2.5961148738861084 }, { "auxiliary_loss_clip": 0.01106588, "auxiliary_loss_mlp": 0.01043158, "balance_loss_clip": 1.05345535, "balance_loss_mlp": 1.0252645, "epoch": 0.22329776040883811, "flos": 41351531466240.0, "grad_norm": 1.7774284049242903, "language_loss": 0.62422931, "learning_rate": 3.6206853285202917e-06, "loss": 0.6457268, "num_input_tokens_seen": 80022450, "step": 3714, "time_per_iteration": 2.9655838012695312 }, { "auxiliary_loss_clip": 0.0112071, "auxiliary_loss_mlp": 0.01033065, "balance_loss_clip": 1.05258489, "balance_loss_mlp": 1.0163759, "epoch": 0.22335788366150608, "flos": 25119478477440.0, "grad_norm": 5.465931600334143, "language_loss": 0.79076529, "learning_rate": 3.6204570902739164e-06, "loss": 0.81230301, "num_input_tokens_seen": 80042100, "step": 3715, "time_per_iteration": 2.8040106296539307 }, { "auxiliary_loss_clip": 0.01113318, "auxiliary_loss_mlp": 0.01049585, "balance_loss_clip": 1.05601192, "balance_loss_mlp": 1.03176367, "epoch": 0.22341800691417404, "flos": 16983377769600.0, "grad_norm": 2.696607190089822, "language_loss": 0.77416688, "learning_rate": 3.620228790579645e-06, "loss": 0.79579592, "num_input_tokens_seen": 80059690, "step": 3716, "time_per_iteration": 2.721008777618408 }, { "auxiliary_loss_clip": 0.01123787, "auxiliary_loss_mlp": 0.01043954, "balance_loss_clip": 1.04860306, "balance_loss_mlp": 1.02644157, "epoch": 0.22347813016684204, "flos": 14136738975360.0, "grad_norm": 3.4762745813408884, "language_loss": 0.79258984, "learning_rate": 3.6200004294461367e-06, "loss": 0.81426722, "num_input_tokens_seen": 80076060, "step": 3717, "time_per_iteration": 2.724637746810913 }, { "auxiliary_loss_clip": 0.0107853, "auxiliary_loss_mlp": 0.01042478, "balance_loss_clip": 1.04485083, "balance_loss_mlp": 1.02390504, "epoch": 0.22353825341951, "flos": 23583147914880.0, "grad_norm": 1.9798483733973138, "language_loss": 0.67890245, "learning_rate": 3.6197720068820497e-06, "loss": 0.70011252, "num_input_tokens_seen": 80094760, "step": 3718, "time_per_iteration": 2.8178799152374268 }, { "auxiliary_loss_clip": 0.01128946, "auxiliary_loss_mlp": 0.01043035, "balance_loss_clip": 1.04887676, "balance_loss_mlp": 1.02374721, "epoch": 0.22359837667217797, "flos": 29824206888960.0, "grad_norm": 1.6261924310986715, "language_loss": 0.81046188, "learning_rate": 3.619543522896045e-06, "loss": 0.83218175, "num_input_tokens_seen": 80114475, "step": 3719, "time_per_iteration": 2.8068079948425293 }, { "auxiliary_loss_clip": 0.0112823, "auxiliary_loss_mlp": 0.0105526, "balance_loss_clip": 1.05054009, "balance_loss_mlp": 1.03555441, "epoch": 0.22365849992484593, "flos": 17603088140160.0, "grad_norm": 2.128611791985372, "language_loss": 0.86535168, "learning_rate": 3.6193149774967885e-06, "loss": 0.88718653, "num_input_tokens_seen": 80132920, "step": 3720, "time_per_iteration": 2.726252794265747 }, { "auxiliary_loss_clip": 0.01123833, "auxiliary_loss_mlp": 0.01039252, "balance_loss_clip": 1.05347347, "balance_loss_mlp": 1.0207628, "epoch": 0.2237186231775139, "flos": 22710949868160.0, "grad_norm": 1.725668609175168, "language_loss": 0.7471531, "learning_rate": 3.619086370692945e-06, "loss": 0.76878393, "num_input_tokens_seen": 80152845, "step": 3721, "time_per_iteration": 2.77329158782959 }, { "auxiliary_loss_clip": 0.01158005, "auxiliary_loss_mlp": 0.01043442, "balance_loss_clip": 1.05607998, "balance_loss_mlp": 1.02497673, "epoch": 0.22377874643018186, "flos": 13371518609280.0, "grad_norm": 3.166607303525693, "language_loss": 0.7957024, "learning_rate": 3.6188577024931844e-06, "loss": 0.8177169, "num_input_tokens_seen": 80170680, "step": 3722, "time_per_iteration": 2.7204909324645996 }, { "auxiliary_loss_clip": 0.01113056, "auxiliary_loss_mlp": 0.01041868, "balance_loss_clip": 1.0520618, "balance_loss_mlp": 1.02571511, "epoch": 0.22383886968284986, "flos": 17894970057600.0, "grad_norm": 2.0043774256219997, "language_loss": 0.82129884, "learning_rate": 3.618628972906178e-06, "loss": 0.84284806, "num_input_tokens_seen": 80189030, "step": 3723, "time_per_iteration": 2.7908549308776855 }, { "auxiliary_loss_clip": 0.01155309, "auxiliary_loss_mlp": 0.01046826, "balance_loss_clip": 1.05468059, "balance_loss_mlp": 1.02857494, "epoch": 0.22389899293551782, "flos": 23879123982720.0, "grad_norm": 2.0838579777085022, "language_loss": 0.84742224, "learning_rate": 3.6184001819405984e-06, "loss": 0.86944354, "num_input_tokens_seen": 80208365, "step": 3724, "time_per_iteration": 2.691678047180176 }, { "auxiliary_loss_clip": 0.01123425, "auxiliary_loss_mlp": 0.01042537, "balance_loss_clip": 1.0494504, "balance_loss_mlp": 1.02516866, "epoch": 0.2239591161881858, "flos": 27272430840960.0, "grad_norm": 1.76453761267329, "language_loss": 0.79456621, "learning_rate": 3.618171329605121e-06, "loss": 0.81622583, "num_input_tokens_seen": 80228685, "step": 3725, "time_per_iteration": 4.339299917221069 }, { "auxiliary_loss_clip": 0.01091555, "auxiliary_loss_mlp": 0.01043361, "balance_loss_clip": 1.05116296, "balance_loss_mlp": 1.02538443, "epoch": 0.22401923944085375, "flos": 22236857233920.0, "grad_norm": 1.776149940187026, "language_loss": 0.77333415, "learning_rate": 3.6179424159084254e-06, "loss": 0.79468334, "num_input_tokens_seen": 80247635, "step": 3726, "time_per_iteration": 4.320322275161743 }, { "auxiliary_loss_clip": 0.0115151, "auxiliary_loss_mlp": 0.01047267, "balance_loss_clip": 1.05424356, "balance_loss_mlp": 1.02664328, "epoch": 0.22407936269352172, "flos": 12053668521600.0, "grad_norm": 2.83844669603944, "language_loss": 0.72643399, "learning_rate": 3.6177134408591914e-06, "loss": 0.74842173, "num_input_tokens_seen": 80260045, "step": 3727, "time_per_iteration": 4.218656539916992 }, { "auxiliary_loss_clip": 0.01157504, "auxiliary_loss_mlp": 0.01043436, "balance_loss_clip": 1.0541296, "balance_loss_mlp": 1.02321815, "epoch": 0.22413948594618968, "flos": 19353553632000.0, "grad_norm": 2.250671737688348, "language_loss": 0.86600292, "learning_rate": 3.6174844044661013e-06, "loss": 0.88801229, "num_input_tokens_seen": 80277680, "step": 3728, "time_per_iteration": 2.650423765182495 }, { "auxiliary_loss_clip": 0.01122602, "auxiliary_loss_mlp": 0.01053562, "balance_loss_clip": 1.050982, "balance_loss_mlp": 1.03134131, "epoch": 0.22419960919885765, "flos": 24170000319360.0, "grad_norm": 2.1953419048873877, "language_loss": 0.80038953, "learning_rate": 3.6172553067378406e-06, "loss": 0.82215106, "num_input_tokens_seen": 80294795, "step": 3729, "time_per_iteration": 2.7553794384002686 }, { "auxiliary_loss_clip": 0.01126228, "auxiliary_loss_mlp": 0.01046911, "balance_loss_clip": 1.05183935, "balance_loss_mlp": 1.02992368, "epoch": 0.22425973245152564, "flos": 27378977558400.0, "grad_norm": 1.8211738544282683, "language_loss": 0.86968076, "learning_rate": 3.6170261476830964e-06, "loss": 0.89141214, "num_input_tokens_seen": 80315425, "step": 3730, "time_per_iteration": 2.8044395446777344 }, { "auxiliary_loss_clip": 0.01121982, "auxiliary_loss_mlp": 0.00775761, "balance_loss_clip": 1.04924226, "balance_loss_mlp": 1.00148201, "epoch": 0.2243198557041936, "flos": 13735652734080.0, "grad_norm": 2.1817469574553017, "language_loss": 0.73091185, "learning_rate": 3.616796927310559e-06, "loss": 0.74988931, "num_input_tokens_seen": 80333905, "step": 3731, "time_per_iteration": 2.764198064804077 }, { "auxiliary_loss_clip": 0.01127044, "auxiliary_loss_mlp": 0.0104235, "balance_loss_clip": 1.05654919, "balance_loss_mlp": 1.02467108, "epoch": 0.22437997895686157, "flos": 19530700531200.0, "grad_norm": 2.1924274894904787, "language_loss": 0.75427651, "learning_rate": 3.6165676456289195e-06, "loss": 0.77597046, "num_input_tokens_seen": 80352165, "step": 3732, "time_per_iteration": 4.544835090637207 }, { "auxiliary_loss_clip": 0.01155285, "auxiliary_loss_mlp": 0.01053522, "balance_loss_clip": 1.05655456, "balance_loss_mlp": 1.03560436, "epoch": 0.22444010220952954, "flos": 23696230907520.0, "grad_norm": 1.745203479087184, "language_loss": 0.88139856, "learning_rate": 3.616338302646873e-06, "loss": 0.90348667, "num_input_tokens_seen": 80371305, "step": 3733, "time_per_iteration": 2.7097933292388916 }, { "auxiliary_loss_clip": 0.0110922, "auxiliary_loss_mlp": 0.01040674, "balance_loss_clip": 1.05094051, "balance_loss_mlp": 1.02264953, "epoch": 0.2245002254621975, "flos": 22382905933440.0, "grad_norm": 1.6873732683679492, "language_loss": 0.84643197, "learning_rate": 3.6161088983731166e-06, "loss": 0.86793089, "num_input_tokens_seen": 80391020, "step": 3734, "time_per_iteration": 2.7647547721862793 }, { "auxiliary_loss_clip": 0.0113181, "auxiliary_loss_mlp": 0.01049327, "balance_loss_clip": 1.05362856, "balance_loss_mlp": 1.03149319, "epoch": 0.22456034871486547, "flos": 26942303917440.0, "grad_norm": 1.774553175519815, "language_loss": 0.7679311, "learning_rate": 3.6158794328163482e-06, "loss": 0.78974247, "num_input_tokens_seen": 80411365, "step": 3735, "time_per_iteration": 2.7682430744171143 }, { "auxiliary_loss_clip": 0.01138858, "auxiliary_loss_mlp": 0.01045746, "balance_loss_clip": 1.06029248, "balance_loss_mlp": 1.02927136, "epoch": 0.22462047196753343, "flos": 28983538005120.0, "grad_norm": 1.671324371931155, "language_loss": 0.842767, "learning_rate": 3.6156499059852702e-06, "loss": 0.86461306, "num_input_tokens_seen": 80431075, "step": 3736, "time_per_iteration": 3.009368419647217 }, { "auxiliary_loss_clip": 0.0111279, "auxiliary_loss_mlp": 0.01044111, "balance_loss_clip": 1.05240226, "balance_loss_mlp": 1.02677774, "epoch": 0.22468059522020142, "flos": 20011329440640.0, "grad_norm": 1.8971112354532307, "language_loss": 0.86643183, "learning_rate": 3.615420317888586e-06, "loss": 0.88800085, "num_input_tokens_seen": 80449240, "step": 3737, "time_per_iteration": 2.792965888977051 }, { "auxiliary_loss_clip": 0.0115891, "auxiliary_loss_mlp": 0.0104972, "balance_loss_clip": 1.05792093, "balance_loss_mlp": 1.03051496, "epoch": 0.2247407184728694, "flos": 29314239546240.0, "grad_norm": 6.664079021041442, "language_loss": 0.79027152, "learning_rate": 3.6151906685350006e-06, "loss": 0.81235784, "num_input_tokens_seen": 80467900, "step": 3738, "time_per_iteration": 2.716878652572632 }, { "auxiliary_loss_clip": 0.01122737, "auxiliary_loss_mlp": 0.01047993, "balance_loss_clip": 1.0520165, "balance_loss_mlp": 1.0315063, "epoch": 0.22480084172553735, "flos": 22310366417280.0, "grad_norm": 1.837059456311059, "language_loss": 0.76693523, "learning_rate": 3.614960957933224e-06, "loss": 0.78864253, "num_input_tokens_seen": 80487100, "step": 3739, "time_per_iteration": 2.743222713470459 }, { "auxiliary_loss_clip": 0.01116493, "auxiliary_loss_mlp": 0.01049772, "balance_loss_clip": 1.05008686, "balance_loss_mlp": 1.03011417, "epoch": 0.22486096497820532, "flos": 25591272641280.0, "grad_norm": 2.2924613412630133, "language_loss": 0.74577379, "learning_rate": 3.6147311860919655e-06, "loss": 0.7674365, "num_input_tokens_seen": 80508625, "step": 3740, "time_per_iteration": 2.7339253425598145 }, { "auxiliary_loss_clip": 0.01152276, "auxiliary_loss_mlp": 0.01045147, "balance_loss_clip": 1.05556941, "balance_loss_mlp": 1.02728927, "epoch": 0.22492108823087328, "flos": 17639824775040.0, "grad_norm": 1.9086069443180373, "language_loss": 0.75610423, "learning_rate": 3.614501353019939e-06, "loss": 0.77807844, "num_input_tokens_seen": 80527345, "step": 3741, "time_per_iteration": 2.7347571849823 }, { "auxiliary_loss_clip": 0.01133279, "auxiliary_loss_mlp": 0.01039745, "balance_loss_clip": 1.05599904, "balance_loss_mlp": 1.02316284, "epoch": 0.22498121148354125, "flos": 16034653797120.0, "grad_norm": 1.7754272123040742, "language_loss": 0.87332213, "learning_rate": 3.6142714587258592e-06, "loss": 0.89505225, "num_input_tokens_seen": 80545545, "step": 3742, "time_per_iteration": 2.702103614807129 }, { "auxiliary_loss_clip": 0.01095068, "auxiliary_loss_mlp": 0.01053093, "balance_loss_clip": 1.04728913, "balance_loss_mlp": 1.03398395, "epoch": 0.22504133473620924, "flos": 24023772051840.0, "grad_norm": 2.1035678371185256, "language_loss": 0.812823, "learning_rate": 3.614041503218444e-06, "loss": 0.83430457, "num_input_tokens_seen": 80565040, "step": 3743, "time_per_iteration": 2.777566909790039 }, { "auxiliary_loss_clip": 0.01142483, "auxiliary_loss_mlp": 0.01040692, "balance_loss_clip": 1.05282855, "balance_loss_mlp": 1.02319252, "epoch": 0.2251014579888772, "flos": 16763963541120.0, "grad_norm": 2.836562973763206, "language_loss": 0.63821399, "learning_rate": 3.6138114865064134e-06, "loss": 0.66004574, "num_input_tokens_seen": 80582815, "step": 3744, "time_per_iteration": 2.6738698482513428 }, { "auxiliary_loss_clip": 0.01139201, "auxiliary_loss_mlp": 0.01043137, "balance_loss_clip": 1.05523586, "balance_loss_mlp": 1.0255779, "epoch": 0.22516158124154517, "flos": 13991013498240.0, "grad_norm": 4.405698565190268, "language_loss": 0.76340199, "learning_rate": 3.613581408598489e-06, "loss": 0.78522527, "num_input_tokens_seen": 80600865, "step": 3745, "time_per_iteration": 2.8423044681549072 }, { "auxiliary_loss_clip": 0.01116037, "auxiliary_loss_mlp": 0.0104407, "balance_loss_clip": 1.04906797, "balance_loss_mlp": 1.0267489, "epoch": 0.22522170449421314, "flos": 14390016750720.0, "grad_norm": 7.51155110796741, "language_loss": 0.8056733, "learning_rate": 3.6133512695033965e-06, "loss": 0.82727438, "num_input_tokens_seen": 80617455, "step": 3746, "time_per_iteration": 2.743417739868164 }, { "auxiliary_loss_clip": 0.01142091, "auxiliary_loss_mlp": 0.01050597, "balance_loss_clip": 1.05323768, "balance_loss_mlp": 1.0328114, "epoch": 0.2252818277468811, "flos": 23805542972160.0, "grad_norm": 2.6189948571262116, "language_loss": 0.86153656, "learning_rate": 3.613121069229862e-06, "loss": 0.88346344, "num_input_tokens_seen": 80635125, "step": 3747, "time_per_iteration": 2.7622148990631104 }, { "auxiliary_loss_clip": 0.01138021, "auxiliary_loss_mlp": 0.0077598, "balance_loss_clip": 1.05126321, "balance_loss_mlp": 1.00154519, "epoch": 0.22534195099954907, "flos": 24718033100160.0, "grad_norm": 2.3477587169419483, "language_loss": 0.76400602, "learning_rate": 3.6128908077866145e-06, "loss": 0.78314602, "num_input_tokens_seen": 80656370, "step": 3748, "time_per_iteration": 2.7347261905670166 }, { "auxiliary_loss_clip": 0.01156837, "auxiliary_loss_mlp": 0.01043045, "balance_loss_clip": 1.05704546, "balance_loss_mlp": 1.02525926, "epoch": 0.22540207425221703, "flos": 21032341534080.0, "grad_norm": 1.5503962030073002, "language_loss": 0.7984724, "learning_rate": 3.6126604851823864e-06, "loss": 0.82047117, "num_input_tokens_seen": 80676495, "step": 3749, "time_per_iteration": 2.6900558471679688 }, { "auxiliary_loss_clip": 0.01123701, "auxiliary_loss_mlp": 0.01041028, "balance_loss_clip": 1.05050755, "balance_loss_mlp": 1.02436304, "epoch": 0.22546219750488503, "flos": 19390362094080.0, "grad_norm": 3.015206251853355, "language_loss": 0.79585081, "learning_rate": 3.6124301014259108e-06, "loss": 0.81749809, "num_input_tokens_seen": 80694755, "step": 3750, "time_per_iteration": 2.727651596069336 }, { "auxiliary_loss_clip": 0.01097337, "auxiliary_loss_mlp": 0.01055462, "balance_loss_clip": 1.05065274, "balance_loss_mlp": 1.03756917, "epoch": 0.225522320757553, "flos": 25192628524800.0, "grad_norm": 2.662961533862713, "language_loss": 0.82433236, "learning_rate": 3.6121996565259244e-06, "loss": 0.84586036, "num_input_tokens_seen": 80713670, "step": 3751, "time_per_iteration": 2.827995538711548 }, { "auxiliary_loss_clip": 0.01121046, "auxiliary_loss_mlp": 0.01046103, "balance_loss_clip": 1.05429292, "balance_loss_mlp": 1.02828133, "epoch": 0.22558244401022096, "flos": 17163110448000.0, "grad_norm": 2.0142745824369315, "language_loss": 0.83813727, "learning_rate": 3.611969150491165e-06, "loss": 0.8598088, "num_input_tokens_seen": 80731450, "step": 3752, "time_per_iteration": 2.78725266456604 }, { "auxiliary_loss_clip": 0.01152116, "auxiliary_loss_mlp": 0.01037502, "balance_loss_clip": 1.05584741, "balance_loss_mlp": 1.02123034, "epoch": 0.22564256726288892, "flos": 15231008856960.0, "grad_norm": 1.9292267305553392, "language_loss": 0.78254855, "learning_rate": 3.611738583330375e-06, "loss": 0.80444479, "num_input_tokens_seen": 80748415, "step": 3753, "time_per_iteration": 2.7116169929504395 }, { "auxiliary_loss_clip": 0.01126321, "auxiliary_loss_mlp": 0.0104341, "balance_loss_clip": 1.05120027, "balance_loss_mlp": 1.02546871, "epoch": 0.2257026905155569, "flos": 34568652764160.0, "grad_norm": 1.8777790089425805, "language_loss": 0.78391469, "learning_rate": 3.611507955052295e-06, "loss": 0.80561191, "num_input_tokens_seen": 80770835, "step": 3754, "time_per_iteration": 2.91738224029541 }, { "auxiliary_loss_clip": 0.01128102, "auxiliary_loss_mlp": 0.01048192, "balance_loss_clip": 1.05648673, "balance_loss_mlp": 1.03040624, "epoch": 0.22576281376822485, "flos": 19938430788480.0, "grad_norm": 1.9337610105869587, "language_loss": 0.70648986, "learning_rate": 3.6112772656656727e-06, "loss": 0.72825277, "num_input_tokens_seen": 80787840, "step": 3755, "time_per_iteration": 2.7427992820739746 }, { "auxiliary_loss_clip": 0.01126515, "auxiliary_loss_mlp": 0.01053366, "balance_loss_clip": 1.05531752, "balance_loss_mlp": 1.03559232, "epoch": 0.22582293702089282, "flos": 24602005192320.0, "grad_norm": 3.9817469401483216, "language_loss": 0.77865845, "learning_rate": 3.6110465151792547e-06, "loss": 0.80045724, "num_input_tokens_seen": 80806335, "step": 3756, "time_per_iteration": 2.7879996299743652 }, { "auxiliary_loss_clip": 0.01132066, "auxiliary_loss_mlp": 0.01044227, "balance_loss_clip": 1.0559032, "balance_loss_mlp": 1.0261426, "epoch": 0.2258830602735608, "flos": 23035438356480.0, "grad_norm": 1.801741818571408, "language_loss": 0.82615864, "learning_rate": 3.6108157036017916e-06, "loss": 0.84792161, "num_input_tokens_seen": 80825355, "step": 3757, "time_per_iteration": 2.685218095779419 }, { "auxiliary_loss_clip": 0.01140048, "auxiliary_loss_mlp": 0.01047555, "balance_loss_clip": 1.05321026, "balance_loss_mlp": 1.02917302, "epoch": 0.22594318352622877, "flos": 22158427887360.0, "grad_norm": 2.3786564016745495, "language_loss": 0.73007452, "learning_rate": 3.6105848309420358e-06, "loss": 0.7519505, "num_input_tokens_seen": 80842570, "step": 3758, "time_per_iteration": 2.6716878414154053 }, { "auxiliary_loss_clip": 0.01137739, "auxiliary_loss_mlp": 0.01048984, "balance_loss_clip": 1.0577718, "balance_loss_mlp": 1.03019619, "epoch": 0.22600330677889674, "flos": 20594303176320.0, "grad_norm": 2.226232476294752, "language_loss": 0.77150333, "learning_rate": 3.6103538972087412e-06, "loss": 0.79337054, "num_input_tokens_seen": 80858745, "step": 3759, "time_per_iteration": 2.787487030029297 }, { "auxiliary_loss_clip": 0.01104852, "auxiliary_loss_mlp": 0.01043473, "balance_loss_clip": 1.04747176, "balance_loss_mlp": 1.02507949, "epoch": 0.2260634300315647, "flos": 35659798162560.0, "grad_norm": 1.6253921855068183, "language_loss": 0.78189945, "learning_rate": 3.6101229024106655e-06, "loss": 0.80338269, "num_input_tokens_seen": 80880085, "step": 3760, "time_per_iteration": 2.8760766983032227 }, { "auxiliary_loss_clip": 0.01042849, "auxiliary_loss_mlp": 0.01009599, "balance_loss_clip": 1.03235281, "balance_loss_mlp": 1.00633264, "epoch": 0.22612355328423267, "flos": 72090455126400.0, "grad_norm": 0.9481639821873915, "language_loss": 0.60083473, "learning_rate": 3.609891846556569e-06, "loss": 0.62135923, "num_input_tokens_seen": 80937660, "step": 3761, "time_per_iteration": 3.2168753147125244 }, { "auxiliary_loss_clip": 0.01114836, "auxiliary_loss_mlp": 0.01051216, "balance_loss_clip": 1.0493567, "balance_loss_mlp": 1.03295338, "epoch": 0.22618367653690064, "flos": 22783776693120.0, "grad_norm": 2.3328987294287047, "language_loss": 0.76767397, "learning_rate": 3.609660729655211e-06, "loss": 0.78933448, "num_input_tokens_seen": 80956265, "step": 3762, "time_per_iteration": 2.8012428283691406 }, { "auxiliary_loss_clip": 0.01128732, "auxiliary_loss_mlp": 0.01042327, "balance_loss_clip": 1.05266595, "balance_loss_mlp": 1.02190685, "epoch": 0.22624379978956863, "flos": 20448254476800.0, "grad_norm": 2.7297545785195907, "language_loss": 0.79000401, "learning_rate": 3.6094295517153573e-06, "loss": 0.81171465, "num_input_tokens_seen": 80975185, "step": 3763, "time_per_iteration": 2.7217857837677 }, { "auxiliary_loss_clip": 0.01142679, "auxiliary_loss_mlp": 0.01057425, "balance_loss_clip": 1.0557214, "balance_loss_mlp": 1.03835177, "epoch": 0.2263039230422366, "flos": 17494314779520.0, "grad_norm": 31.68022075556768, "language_loss": 0.91241246, "learning_rate": 3.6091983127457743e-06, "loss": 0.93441343, "num_input_tokens_seen": 80992830, "step": 3764, "time_per_iteration": 4.232046842575073 }, { "auxiliary_loss_clip": 0.01131876, "auxiliary_loss_mlp": 0.01055516, "balance_loss_clip": 1.05196834, "balance_loss_mlp": 1.0367409, "epoch": 0.22636404629490456, "flos": 28329748606080.0, "grad_norm": 1.9816130101247444, "language_loss": 0.75202596, "learning_rate": 3.6089670127552293e-06, "loss": 0.77389991, "num_input_tokens_seen": 81013675, "step": 3765, "time_per_iteration": 4.291628122329712 }, { "auxiliary_loss_clip": 0.01140284, "auxiliary_loss_mlp": 0.01047009, "balance_loss_clip": 1.05632913, "balance_loss_mlp": 1.02942574, "epoch": 0.22642416954757252, "flos": 17489143221120.0, "grad_norm": 2.1881182413466176, "language_loss": 0.8966549, "learning_rate": 3.608735651752494e-06, "loss": 0.91852784, "num_input_tokens_seen": 81030345, "step": 3766, "time_per_iteration": 2.6462960243225098 }, { "auxiliary_loss_clip": 0.01126107, "auxiliary_loss_mlp": 0.01047462, "balance_loss_clip": 1.05579042, "balance_loss_mlp": 1.02950931, "epoch": 0.2264842928002405, "flos": 24384530298240.0, "grad_norm": 1.6297384952566736, "language_loss": 0.74816859, "learning_rate": 3.6085042297463417e-06, "loss": 0.76990426, "num_input_tokens_seen": 81051000, "step": 3767, "time_per_iteration": 4.181917667388916 }, { "auxiliary_loss_clip": 0.01139766, "auxiliary_loss_mlp": 0.01048037, "balance_loss_clip": 1.05206823, "balance_loss_mlp": 1.02981031, "epoch": 0.22654441605290845, "flos": 19830519354240.0, "grad_norm": 1.6389844555489992, "language_loss": 0.71764815, "learning_rate": 3.6082727467455477e-06, "loss": 0.73952615, "num_input_tokens_seen": 81071205, "step": 3768, "time_per_iteration": 2.6622893810272217 }, { "auxiliary_loss_clip": 0.01143239, "auxiliary_loss_mlp": 0.01057198, "balance_loss_clip": 1.05766034, "balance_loss_mlp": 1.03895879, "epoch": 0.22660453930557642, "flos": 27454569730560.0, "grad_norm": 1.5883345705718652, "language_loss": 0.78320074, "learning_rate": 3.6080412027588905e-06, "loss": 0.80520505, "num_input_tokens_seen": 81091880, "step": 3769, "time_per_iteration": 2.692366123199463 }, { "auxiliary_loss_clip": 0.01121985, "auxiliary_loss_mlp": 0.01045951, "balance_loss_clip": 1.0452522, "balance_loss_mlp": 1.02712774, "epoch": 0.2266646625582444, "flos": 23988148738560.0, "grad_norm": 1.8427419299971495, "language_loss": 0.6877771, "learning_rate": 3.6078095977951488e-06, "loss": 0.70945644, "num_input_tokens_seen": 81113290, "step": 3770, "time_per_iteration": 2.7605137825012207 }, { "auxiliary_loss_clip": 0.01155061, "auxiliary_loss_mlp": 0.01053072, "balance_loss_clip": 1.0551908, "balance_loss_mlp": 1.03454649, "epoch": 0.22672478581091238, "flos": 26028054023040.0, "grad_norm": 1.6594447480271795, "language_loss": 0.80540276, "learning_rate": 3.6075779318631067e-06, "loss": 0.82748413, "num_input_tokens_seen": 81133535, "step": 3771, "time_per_iteration": 4.265140771865845 }, { "auxiliary_loss_clip": 0.0110854, "auxiliary_loss_mlp": 0.01058177, "balance_loss_clip": 1.04661536, "balance_loss_mlp": 1.04091501, "epoch": 0.22678490906358034, "flos": 23841812730240.0, "grad_norm": 1.6696234119475444, "language_loss": 0.78947794, "learning_rate": 3.6073462049715486e-06, "loss": 0.81114507, "num_input_tokens_seen": 81154650, "step": 3772, "time_per_iteration": 2.7325806617736816 }, { "auxiliary_loss_clip": 0.01036659, "auxiliary_loss_mlp": 0.0100656, "balance_loss_clip": 1.0461247, "balance_loss_mlp": 1.00336492, "epoch": 0.2268450323162483, "flos": 65048088574080.0, "grad_norm": 0.653194629863103, "language_loss": 0.54380804, "learning_rate": 3.607114417129261e-06, "loss": 0.56424022, "num_input_tokens_seen": 81221240, "step": 3773, "time_per_iteration": 3.3729567527770996 }, { "auxiliary_loss_clip": 0.0111914, "auxiliary_loss_mlp": 0.01046238, "balance_loss_clip": 1.05257821, "balance_loss_mlp": 1.02851129, "epoch": 0.22690515556891627, "flos": 22526081544960.0, "grad_norm": 1.81548541557593, "language_loss": 0.70406783, "learning_rate": 3.6068825683450334e-06, "loss": 0.7257216, "num_input_tokens_seen": 81241520, "step": 3774, "time_per_iteration": 2.7159364223480225 }, { "auxiliary_loss_clip": 0.01125586, "auxiliary_loss_mlp": 0.01046805, "balance_loss_clip": 1.05404115, "balance_loss_mlp": 1.02929282, "epoch": 0.22696527882158424, "flos": 18223444955520.0, "grad_norm": 2.2603412716687523, "language_loss": 0.74377871, "learning_rate": 3.606650658627658e-06, "loss": 0.76550257, "num_input_tokens_seen": 81256825, "step": 3775, "time_per_iteration": 2.7857720851898193 }, { "auxiliary_loss_clip": 0.01152024, "auxiliary_loss_mlp": 0.01045868, "balance_loss_clip": 1.05331159, "balance_loss_mlp": 1.02915478, "epoch": 0.22702540207425223, "flos": 17019252478080.0, "grad_norm": 1.8428958927362264, "language_loss": 0.81582248, "learning_rate": 3.606418687985928e-06, "loss": 0.83780146, "num_input_tokens_seen": 81275695, "step": 3776, "time_per_iteration": 2.6054935455322266 }, { "auxiliary_loss_clip": 0.01135081, "auxiliary_loss_mlp": 0.01043769, "balance_loss_clip": 1.05466735, "balance_loss_mlp": 1.02654314, "epoch": 0.2270855253269202, "flos": 21325731822720.0, "grad_norm": 1.7711090356153572, "language_loss": 0.82893199, "learning_rate": 3.606186656428641e-06, "loss": 0.85072052, "num_input_tokens_seen": 81294920, "step": 3777, "time_per_iteration": 2.722621202468872 }, { "auxiliary_loss_clip": 0.01127657, "auxiliary_loss_mlp": 0.01042436, "balance_loss_clip": 1.05438471, "balance_loss_mlp": 1.02435195, "epoch": 0.22714564857958816, "flos": 23550469516800.0, "grad_norm": 2.3905711679994295, "language_loss": 0.72538829, "learning_rate": 3.6059545639645955e-06, "loss": 0.74708927, "num_input_tokens_seen": 81314275, "step": 3778, "time_per_iteration": 2.730919599533081 }, { "auxiliary_loss_clip": 0.01112853, "auxiliary_loss_mlp": 0.01040216, "balance_loss_clip": 1.05304575, "balance_loss_mlp": 1.02241838, "epoch": 0.22720577183225613, "flos": 25989880844160.0, "grad_norm": 2.4150679449588535, "language_loss": 0.64176035, "learning_rate": 3.605722410602591e-06, "loss": 0.66329098, "num_input_tokens_seen": 81333890, "step": 3779, "time_per_iteration": 2.7663822174072266 }, { "auxiliary_loss_clip": 0.01132359, "auxiliary_loss_mlp": 0.01047274, "balance_loss_clip": 1.05292106, "balance_loss_mlp": 1.02928495, "epoch": 0.2272658950849241, "flos": 20814076540800.0, "grad_norm": 1.6627524387617407, "language_loss": 0.70659381, "learning_rate": 3.6054901963514323e-06, "loss": 0.72839016, "num_input_tokens_seen": 81353640, "step": 3780, "time_per_iteration": 2.666081666946411 }, { "auxiliary_loss_clip": 0.0114157, "auxiliary_loss_mlp": 0.01046965, "balance_loss_clip": 1.05450416, "balance_loss_mlp": 1.02880907, "epoch": 0.22732601833759206, "flos": 23909324342400.0, "grad_norm": 1.783300050979337, "language_loss": 0.89418924, "learning_rate": 3.6052579212199246e-06, "loss": 0.91607457, "num_input_tokens_seen": 81371595, "step": 3781, "time_per_iteration": 2.686478614807129 }, { "auxiliary_loss_clip": 0.01152428, "auxiliary_loss_mlp": 0.01041162, "balance_loss_clip": 1.05349672, "balance_loss_mlp": 1.02354264, "epoch": 0.22738614159026002, "flos": 15924407978880.0, "grad_norm": 19.977426185094338, "language_loss": 0.74404943, "learning_rate": 3.6050255852168753e-06, "loss": 0.76598531, "num_input_tokens_seen": 81388435, "step": 3782, "time_per_iteration": 2.5633177757263184 }, { "auxiliary_loss_clip": 0.01129007, "auxiliary_loss_mlp": 0.01045443, "balance_loss_clip": 1.05195391, "balance_loss_mlp": 1.02926588, "epoch": 0.22744626484292801, "flos": 24205515891840.0, "grad_norm": 2.051662638457334, "language_loss": 0.82665169, "learning_rate": 3.604793188351095e-06, "loss": 0.84839618, "num_input_tokens_seen": 81410195, "step": 3783, "time_per_iteration": 2.742572069168091 }, { "auxiliary_loss_clip": 0.01129724, "auxiliary_loss_mlp": 0.01043254, "balance_loss_clip": 1.055516, "balance_loss_mlp": 1.02495527, "epoch": 0.22750638809559598, "flos": 24791614110720.0, "grad_norm": 2.0126417567412256, "language_loss": 0.75996566, "learning_rate": 3.6045607306313964e-06, "loss": 0.78169543, "num_input_tokens_seen": 81430060, "step": 3784, "time_per_iteration": 2.7283668518066406 }, { "auxiliary_loss_clip": 0.01148666, "auxiliary_loss_mlp": 0.01041397, "balance_loss_clip": 1.05224681, "balance_loss_mlp": 1.02382576, "epoch": 0.22756651134826394, "flos": 22236498097920.0, "grad_norm": 1.784429661746796, "language_loss": 0.7105484, "learning_rate": 3.604328212066594e-06, "loss": 0.73244894, "num_input_tokens_seen": 81447375, "step": 3785, "time_per_iteration": 2.627401351928711 }, { "auxiliary_loss_clip": 0.01042691, "auxiliary_loss_mlp": 0.0101642, "balance_loss_clip": 1.03303862, "balance_loss_mlp": 1.01427412, "epoch": 0.2276266346009319, "flos": 62707466626560.0, "grad_norm": 0.8323137639565091, "language_loss": 0.6189881, "learning_rate": 3.6040956326655047e-06, "loss": 0.63957924, "num_input_tokens_seen": 81505235, "step": 3786, "time_per_iteration": 3.321380376815796 }, { "auxiliary_loss_clip": 0.01135149, "auxiliary_loss_mlp": 0.01044526, "balance_loss_clip": 1.0540669, "balance_loss_mlp": 1.02645397, "epoch": 0.22768675785359987, "flos": 18613936684800.0, "grad_norm": 2.677223616893363, "language_loss": 0.86047274, "learning_rate": 3.6038629924369486e-06, "loss": 0.8822695, "num_input_tokens_seen": 81518685, "step": 3787, "time_per_iteration": 2.72554349899292 }, { "auxiliary_loss_clip": 0.01129718, "auxiliary_loss_mlp": 0.01039908, "balance_loss_clip": 1.05296564, "balance_loss_mlp": 1.02323031, "epoch": 0.22774688110626784, "flos": 26870195364480.0, "grad_norm": 1.361320938410825, "language_loss": 0.72755021, "learning_rate": 3.6036302913897474e-06, "loss": 0.74924648, "num_input_tokens_seen": 81538940, "step": 3788, "time_per_iteration": 2.7717456817626953 }, { "auxiliary_loss_clip": 0.01125411, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.05099773, "balance_loss_mlp": 1.01800895, "epoch": 0.2278070043589358, "flos": 15553593924480.0, "grad_norm": 2.510042380876752, "language_loss": 0.67785919, "learning_rate": 3.6033975295327243e-06, "loss": 0.69946766, "num_input_tokens_seen": 81555525, "step": 3789, "time_per_iteration": 2.6492021083831787 }, { "auxiliary_loss_clip": 0.01114067, "auxiliary_loss_mlp": 0.01042939, "balance_loss_clip": 1.04577208, "balance_loss_mlp": 1.0244137, "epoch": 0.2278671276116038, "flos": 22416805393920.0, "grad_norm": 2.807016388048184, "language_loss": 0.76026487, "learning_rate": 3.6031647068747065e-06, "loss": 0.7818349, "num_input_tokens_seen": 81576305, "step": 3790, "time_per_iteration": 2.789419412612915 }, { "auxiliary_loss_clip": 0.01094774, "auxiliary_loss_mlp": 0.01043575, "balance_loss_clip": 1.04942632, "balance_loss_mlp": 1.02388144, "epoch": 0.22792725086427176, "flos": 20631363033600.0, "grad_norm": 2.1998519418279843, "language_loss": 0.9070015, "learning_rate": 3.602931823424522e-06, "loss": 0.92838502, "num_input_tokens_seen": 81594115, "step": 3791, "time_per_iteration": 2.74957275390625 }, { "auxiliary_loss_clip": 0.01143903, "auxiliary_loss_mlp": 0.01039768, "balance_loss_clip": 1.05332911, "balance_loss_mlp": 1.02229166, "epoch": 0.22798737411693973, "flos": 31428946903680.0, "grad_norm": 1.6288404079645773, "language_loss": 0.82029706, "learning_rate": 3.6026988791910026e-06, "loss": 0.84213376, "num_input_tokens_seen": 81615355, "step": 3792, "time_per_iteration": 2.7578563690185547 }, { "auxiliary_loss_clip": 0.01074793, "auxiliary_loss_mlp": 0.01002047, "balance_loss_clip": 1.03528738, "balance_loss_mlp": 0.99944824, "epoch": 0.2280474973696077, "flos": 52396685827200.0, "grad_norm": 1.1490057531785423, "language_loss": 0.65688264, "learning_rate": 3.602465874182981e-06, "loss": 0.67765105, "num_input_tokens_seen": 81662075, "step": 3793, "time_per_iteration": 2.892385959625244 }, { "auxiliary_loss_clip": 0.01156846, "auxiliary_loss_mlp": 0.01048751, "balance_loss_clip": 1.05509233, "balance_loss_mlp": 1.03063166, "epoch": 0.22810762062227566, "flos": 26396066816640.0, "grad_norm": 2.315054268007893, "language_loss": 0.77095032, "learning_rate": 3.602232808409293e-06, "loss": 0.79300624, "num_input_tokens_seen": 81681625, "step": 3794, "time_per_iteration": 2.6432933807373047 }, { "auxiliary_loss_clip": 0.01106797, "auxiliary_loss_mlp": 0.0104554, "balance_loss_clip": 1.04641223, "balance_loss_mlp": 1.02560771, "epoch": 0.22816774387494362, "flos": 25630271832960.0, "grad_norm": 2.8263872836139194, "language_loss": 0.80649161, "learning_rate": 3.6019996818787755e-06, "loss": 0.82801497, "num_input_tokens_seen": 81701170, "step": 3795, "time_per_iteration": 2.748461961746216 }, { "auxiliary_loss_clip": 0.01136851, "auxiliary_loss_mlp": 0.01049098, "balance_loss_clip": 1.0527277, "balance_loss_mlp": 1.03194404, "epoch": 0.22822786712761162, "flos": 22451602694400.0, "grad_norm": 1.970346796529307, "language_loss": 0.77348727, "learning_rate": 3.6017664946002704e-06, "loss": 0.79534674, "num_input_tokens_seen": 81721265, "step": 3796, "time_per_iteration": 2.6720409393310547 }, { "auxiliary_loss_clip": 0.01111647, "auxiliary_loss_mlp": 0.0077572, "balance_loss_clip": 1.04920197, "balance_loss_mlp": 1.00161827, "epoch": 0.22828799038027958, "flos": 12202554395520.0, "grad_norm": 3.9384070064251793, "language_loss": 0.95837742, "learning_rate": 3.6015332465826188e-06, "loss": 0.97725105, "num_input_tokens_seen": 81736565, "step": 3797, "time_per_iteration": 2.730684995651245 }, { "auxiliary_loss_clip": 0.01140956, "auxiliary_loss_mlp": 0.00774906, "balance_loss_clip": 1.05310869, "balance_loss_mlp": 1.00178146, "epoch": 0.22834811363294755, "flos": 22085708803200.0, "grad_norm": 2.215225796779507, "language_loss": 0.81875294, "learning_rate": 3.601299937834666e-06, "loss": 0.83791155, "num_input_tokens_seen": 81756240, "step": 3798, "time_per_iteration": 2.7082717418670654 }, { "auxiliary_loss_clip": 0.01113838, "auxiliary_loss_mlp": 0.01041342, "balance_loss_clip": 1.04808974, "balance_loss_mlp": 1.02263761, "epoch": 0.2284082368856155, "flos": 24860634094080.0, "grad_norm": 2.1089113145856344, "language_loss": 0.78796971, "learning_rate": 3.6010665683652596e-06, "loss": 0.8095215, "num_input_tokens_seen": 81775720, "step": 3799, "time_per_iteration": 2.7810587882995605 }, { "auxiliary_loss_clip": 0.01121546, "auxiliary_loss_mlp": 0.01055329, "balance_loss_clip": 1.04926765, "balance_loss_mlp": 1.03627968, "epoch": 0.22846836013828348, "flos": 23292882109440.0, "grad_norm": 1.7973625036918341, "language_loss": 0.75191152, "learning_rate": 3.6008331381832484e-06, "loss": 0.77368033, "num_input_tokens_seen": 81795830, "step": 3800, "time_per_iteration": 2.7185163497924805 }, { "auxiliary_loss_clip": 0.01121477, "auxiliary_loss_mlp": 0.01037963, "balance_loss_clip": 1.04833913, "balance_loss_mlp": 1.02235246, "epoch": 0.22852848339095144, "flos": 27416288810880.0, "grad_norm": 1.7410667809724167, "language_loss": 0.64073247, "learning_rate": 3.600599647297484e-06, "loss": 0.66232693, "num_input_tokens_seen": 81815745, "step": 3801, "time_per_iteration": 2.7509078979492188 }, { "auxiliary_loss_clip": 0.01129432, "auxiliary_loss_mlp": 0.01038736, "balance_loss_clip": 1.05498147, "balance_loss_mlp": 1.02301216, "epoch": 0.2285886066436194, "flos": 26321157002880.0, "grad_norm": 1.6732672610702524, "language_loss": 0.81560862, "learning_rate": 3.60036609571682e-06, "loss": 0.83729029, "num_input_tokens_seen": 81835155, "step": 3802, "time_per_iteration": 2.7188339233398438 }, { "auxiliary_loss_clip": 0.01126952, "auxiliary_loss_mlp": 0.0105215, "balance_loss_clip": 1.05203629, "balance_loss_mlp": 1.0342809, "epoch": 0.2286487298962874, "flos": 29716475022720.0, "grad_norm": 2.0652844737971625, "language_loss": 0.78909743, "learning_rate": 3.600132483450114e-06, "loss": 0.81088841, "num_input_tokens_seen": 81855655, "step": 3803, "time_per_iteration": 2.7760777473449707 }, { "auxiliary_loss_clip": 0.01109356, "auxiliary_loss_mlp": 0.01043096, "balance_loss_clip": 1.04399478, "balance_loss_mlp": 1.02511966, "epoch": 0.22870885314895537, "flos": 21287199507840.0, "grad_norm": 1.7519930287683254, "language_loss": 0.84902716, "learning_rate": 3.5998988105062235e-06, "loss": 0.87055165, "num_input_tokens_seen": 81876385, "step": 3804, "time_per_iteration": 5.891911745071411 }, { "auxiliary_loss_clip": 0.01141965, "auxiliary_loss_mlp": 0.01040951, "balance_loss_clip": 1.05229163, "balance_loss_mlp": 1.02440476, "epoch": 0.22876897640162333, "flos": 14939450161920.0, "grad_norm": 2.045415026345325, "language_loss": 0.76673448, "learning_rate": 3.59966507689401e-06, "loss": 0.78856367, "num_input_tokens_seen": 81893225, "step": 3805, "time_per_iteration": 2.643104076385498 }, { "auxiliary_loss_clip": 0.0112853, "auxiliary_loss_mlp": 0.00775286, "balance_loss_clip": 1.05192351, "balance_loss_mlp": 1.00156116, "epoch": 0.2288290996542913, "flos": 18113917409280.0, "grad_norm": 2.368547935700865, "language_loss": 0.78250653, "learning_rate": 3.5994312826223363e-06, "loss": 0.80154467, "num_input_tokens_seen": 81911350, "step": 3806, "time_per_iteration": 4.312817335128784 }, { "auxiliary_loss_clip": 0.01123441, "auxiliary_loss_mlp": 0.01052484, "balance_loss_clip": 1.05244482, "balance_loss_mlp": 1.03282619, "epoch": 0.22888922290695926, "flos": 39855457071360.0, "grad_norm": 2.0706298183861, "language_loss": 0.700813, "learning_rate": 3.5991974277000684e-06, "loss": 0.72257227, "num_input_tokens_seen": 81935420, "step": 3807, "time_per_iteration": 2.8060836791992188 }, { "auxiliary_loss_clip": 0.01143724, "auxiliary_loss_mlp": 0.01057417, "balance_loss_clip": 1.0545013, "balance_loss_mlp": 1.03891551, "epoch": 0.22894934615962723, "flos": 23403774372480.0, "grad_norm": 4.007429648995762, "language_loss": 0.6543591, "learning_rate": 3.5989635121360733e-06, "loss": 0.6763705, "num_input_tokens_seen": 81953845, "step": 3808, "time_per_iteration": 2.703885078430176 }, { "auxiliary_loss_clip": 0.0109921, "auxiliary_loss_mlp": 0.01061828, "balance_loss_clip": 1.04773676, "balance_loss_mlp": 1.04295671, "epoch": 0.22900946941229522, "flos": 18843011671680.0, "grad_norm": 2.028069656557901, "language_loss": 0.74749511, "learning_rate": 3.598729535939222e-06, "loss": 0.76910543, "num_input_tokens_seen": 81972100, "step": 3809, "time_per_iteration": 2.726862907409668 }, { "auxiliary_loss_clip": 0.01128097, "auxiliary_loss_mlp": 0.01053112, "balance_loss_clip": 1.0527637, "balance_loss_mlp": 1.03666139, "epoch": 0.22906959266496318, "flos": 22929394429440.0, "grad_norm": 1.6287389468918274, "language_loss": 0.81654954, "learning_rate": 3.5984954991183862e-06, "loss": 0.83836162, "num_input_tokens_seen": 81992760, "step": 3810, "time_per_iteration": 2.6750009059906006 }, { "auxiliary_loss_clip": 0.01132496, "auxiliary_loss_mlp": 0.01040979, "balance_loss_clip": 1.05216146, "balance_loss_mlp": 1.0247184, "epoch": 0.22912971591763115, "flos": 19354523299200.0, "grad_norm": 2.375204791625097, "language_loss": 0.78126299, "learning_rate": 3.598261401682441e-06, "loss": 0.80299771, "num_input_tokens_seen": 82009080, "step": 3811, "time_per_iteration": 4.302153587341309 }, { "auxiliary_loss_clip": 0.01130856, "auxiliary_loss_mlp": 0.00775213, "balance_loss_clip": 1.05357778, "balance_loss_mlp": 1.00159776, "epoch": 0.22918983917029911, "flos": 19933546538880.0, "grad_norm": 1.797699433224321, "language_loss": 0.82817954, "learning_rate": 3.5980272436402632e-06, "loss": 0.84724021, "num_input_tokens_seen": 82026705, "step": 3812, "time_per_iteration": 2.635796308517456 }, { "auxiliary_loss_clip": 0.01089198, "auxiliary_loss_mlp": 0.01067747, "balance_loss_clip": 1.04705882, "balance_loss_mlp": 1.0480535, "epoch": 0.22924996242296708, "flos": 16690885320960.0, "grad_norm": 3.3357789636694952, "language_loss": 0.82689399, "learning_rate": 3.5977930250007324e-06, "loss": 0.84846342, "num_input_tokens_seen": 82043245, "step": 3813, "time_per_iteration": 2.7896463871002197 }, { "auxiliary_loss_clip": 0.01135441, "auxiliary_loss_mlp": 0.01044219, "balance_loss_clip": 1.05230987, "balance_loss_mlp": 1.02743411, "epoch": 0.22931008567563504, "flos": 33036164956800.0, "grad_norm": 1.5779710642832598, "language_loss": 0.70018709, "learning_rate": 3.5975587457727298e-06, "loss": 0.72198373, "num_input_tokens_seen": 82066870, "step": 3814, "time_per_iteration": 2.759460687637329 }, { "auxiliary_loss_clip": 0.01141204, "auxiliary_loss_mlp": 0.01046745, "balance_loss_clip": 1.05307984, "balance_loss_mlp": 1.02947164, "epoch": 0.229370208928303, "flos": 23330696152320.0, "grad_norm": 2.3195881009003174, "language_loss": 0.66811371, "learning_rate": 3.597324405965139e-06, "loss": 0.6899932, "num_input_tokens_seen": 82083180, "step": 3815, "time_per_iteration": 2.6878743171691895 }, { "auxiliary_loss_clip": 0.01142177, "auxiliary_loss_mlp": 0.01045942, "balance_loss_clip": 1.05412412, "balance_loss_mlp": 1.02921689, "epoch": 0.229430332180971, "flos": 28617213150720.0, "grad_norm": 2.436037188170917, "language_loss": 0.83555114, "learning_rate": 3.597090005586848e-06, "loss": 0.85743231, "num_input_tokens_seen": 82102950, "step": 3816, "time_per_iteration": 2.702638626098633 }, { "auxiliary_loss_clip": 0.01142001, "auxiliary_loss_mlp": 0.01037145, "balance_loss_clip": 1.05649173, "balance_loss_mlp": 1.01952624, "epoch": 0.22949045543363897, "flos": 17238199829760.0, "grad_norm": 2.261586370580253, "language_loss": 0.8657164, "learning_rate": 3.596855544646742e-06, "loss": 0.88750786, "num_input_tokens_seen": 82119510, "step": 3817, "time_per_iteration": 2.6439061164855957 }, { "auxiliary_loss_clip": 0.01125222, "auxiliary_loss_mlp": 0.01048919, "balance_loss_clip": 1.0493896, "balance_loss_mlp": 1.03166902, "epoch": 0.22955057868630693, "flos": 27489438858240.0, "grad_norm": 3.8274774650765706, "language_loss": 0.74976468, "learning_rate": 3.5966210231537154e-06, "loss": 0.77150607, "num_input_tokens_seen": 82140095, "step": 3818, "time_per_iteration": 2.7610766887664795 }, { "auxiliary_loss_clip": 0.01146421, "auxiliary_loss_mlp": 0.01043004, "balance_loss_clip": 1.05866313, "balance_loss_mlp": 1.02550387, "epoch": 0.2296107019389749, "flos": 23476421629440.0, "grad_norm": 1.7490504114150227, "language_loss": 0.74682397, "learning_rate": 3.596386441116659e-06, "loss": 0.76871818, "num_input_tokens_seen": 82159510, "step": 3819, "time_per_iteration": 2.7125203609466553 }, { "auxiliary_loss_clip": 0.0114108, "auxiliary_loss_mlp": 0.0104377, "balance_loss_clip": 1.05479693, "balance_loss_mlp": 1.02630615, "epoch": 0.22967082519164286, "flos": 31285160760960.0, "grad_norm": 2.0230347194773732, "language_loss": 0.81103987, "learning_rate": 3.5961517985444684e-06, "loss": 0.83288836, "num_input_tokens_seen": 82179580, "step": 3820, "time_per_iteration": 2.7268714904785156 }, { "auxiliary_loss_clip": 0.01129285, "auxiliary_loss_mlp": 0.01044606, "balance_loss_clip": 1.05326903, "balance_loss_mlp": 1.02627158, "epoch": 0.22973094844431083, "flos": 14642935390080.0, "grad_norm": 2.2801321869619153, "language_loss": 0.69099033, "learning_rate": 3.595917095446042e-06, "loss": 0.71272922, "num_input_tokens_seen": 82195585, "step": 3821, "time_per_iteration": 2.659498691558838 }, { "auxiliary_loss_clip": 0.01098739, "auxiliary_loss_mlp": 0.01036962, "balance_loss_clip": 1.05118072, "balance_loss_mlp": 1.01888967, "epoch": 0.2297910716969788, "flos": 22823853292800.0, "grad_norm": 1.473505926288008, "language_loss": 0.82876307, "learning_rate": 3.5956823318302796e-06, "loss": 0.85012007, "num_input_tokens_seen": 82217530, "step": 3822, "time_per_iteration": 2.898287057876587 }, { "auxiliary_loss_clip": 0.01149833, "auxiliary_loss_mlp": 0.01044764, "balance_loss_clip": 1.05239797, "balance_loss_mlp": 1.02617884, "epoch": 0.2298511949496468, "flos": 23039029716480.0, "grad_norm": 2.077495396622281, "language_loss": 0.66552204, "learning_rate": 3.5954475077060833e-06, "loss": 0.68746805, "num_input_tokens_seen": 82237980, "step": 3823, "time_per_iteration": 2.6397016048431396 }, { "auxiliary_loss_clip": 0.01064018, "auxiliary_loss_mlp": 0.01005373, "balance_loss_clip": 1.04052305, "balance_loss_mlp": 1.00196409, "epoch": 0.22991131820231475, "flos": 66890914911360.0, "grad_norm": 0.8015900374762405, "language_loss": 0.56731141, "learning_rate": 3.595212623082357e-06, "loss": 0.5880053, "num_input_tokens_seen": 82301785, "step": 3824, "time_per_iteration": 3.2301526069641113 }, { "auxiliary_loss_clip": 0.01123513, "auxiliary_loss_mlp": 0.01037782, "balance_loss_clip": 1.0506382, "balance_loss_mlp": 1.02098525, "epoch": 0.22997144145498272, "flos": 17887248633600.0, "grad_norm": 2.0770938093466995, "language_loss": 0.7301755, "learning_rate": 3.594977677968009e-06, "loss": 0.7517885, "num_input_tokens_seen": 82317355, "step": 3825, "time_per_iteration": 2.6161818504333496 }, { "auxiliary_loss_clip": 0.01147516, "auxiliary_loss_mlp": 0.01049665, "balance_loss_clip": 1.05828226, "balance_loss_mlp": 1.03119957, "epoch": 0.23003156470765068, "flos": 24676843178880.0, "grad_norm": 1.8689845885894332, "language_loss": 0.87652314, "learning_rate": 3.5947426723719473e-06, "loss": 0.89849496, "num_input_tokens_seen": 82336645, "step": 3826, "time_per_iteration": 2.668858766555786 }, { "auxiliary_loss_clip": 0.01134406, "auxiliary_loss_mlp": 0.01045536, "balance_loss_clip": 1.05722022, "balance_loss_mlp": 1.02697468, "epoch": 0.23009168796031865, "flos": 15814126247040.0, "grad_norm": 2.4660324215504312, "language_loss": 0.81861693, "learning_rate": 3.594507606303083e-06, "loss": 0.84041631, "num_input_tokens_seen": 82354225, "step": 3827, "time_per_iteration": 2.67173171043396 }, { "auxiliary_loss_clip": 0.01083629, "auxiliary_loss_mlp": 0.01046658, "balance_loss_clip": 1.04976189, "balance_loss_mlp": 1.02728689, "epoch": 0.2301518112129866, "flos": 16212842190720.0, "grad_norm": 1.9417227311694012, "language_loss": 0.86676306, "learning_rate": 3.5942724797703314e-06, "loss": 0.88806593, "num_input_tokens_seen": 82370240, "step": 3828, "time_per_iteration": 2.7641990184783936 }, { "auxiliary_loss_clip": 0.01126786, "auxiliary_loss_mlp": 0.01048261, "balance_loss_clip": 1.05381465, "balance_loss_mlp": 1.02981901, "epoch": 0.2302119344656546, "flos": 20595452411520.0, "grad_norm": 2.6386744924703223, "language_loss": 0.7044189, "learning_rate": 3.594037292782607e-06, "loss": 0.72616941, "num_input_tokens_seen": 82389145, "step": 3829, "time_per_iteration": 2.6674952507019043 }, { "auxiliary_loss_clip": 0.01085573, "auxiliary_loss_mlp": 0.01045126, "balance_loss_clip": 1.04650855, "balance_loss_mlp": 1.02835345, "epoch": 0.23027205771832257, "flos": 26796901662720.0, "grad_norm": 1.6431866637768902, "language_loss": 0.84075069, "learning_rate": 3.5938020453488293e-06, "loss": 0.86205769, "num_input_tokens_seen": 82409185, "step": 3830, "time_per_iteration": 2.8631880283355713 }, { "auxiliary_loss_clip": 0.01132962, "auxiliary_loss_mlp": 0.01052116, "balance_loss_clip": 1.0506047, "balance_loss_mlp": 1.03415167, "epoch": 0.23033218097099054, "flos": 43873143068160.0, "grad_norm": 2.3429509345019213, "language_loss": 0.67036134, "learning_rate": 3.5935667374779177e-06, "loss": 0.6922121, "num_input_tokens_seen": 82432070, "step": 3831, "time_per_iteration": 2.91282320022583 }, { "auxiliary_loss_clip": 0.0111204, "auxiliary_loss_mlp": 0.01053367, "balance_loss_clip": 1.05277622, "balance_loss_mlp": 1.03496158, "epoch": 0.2303923042236585, "flos": 26067663745920.0, "grad_norm": 2.3469890931023194, "language_loss": 0.75711727, "learning_rate": 3.5933313691787957e-06, "loss": 0.7787714, "num_input_tokens_seen": 82450625, "step": 3832, "time_per_iteration": 2.759467601776123 }, { "auxiliary_loss_clip": 0.0110298, "auxiliary_loss_mlp": 0.01044565, "balance_loss_clip": 1.05044174, "balance_loss_mlp": 1.02596867, "epoch": 0.23045242747632647, "flos": 18296379521280.0, "grad_norm": 1.7769817461106177, "language_loss": 0.87558299, "learning_rate": 3.593095940460389e-06, "loss": 0.89705843, "num_input_tokens_seen": 82468575, "step": 3833, "time_per_iteration": 2.8548035621643066 }, { "auxiliary_loss_clip": 0.01116173, "auxiliary_loss_mlp": 0.01046082, "balance_loss_clip": 1.05032015, "balance_loss_mlp": 1.02814126, "epoch": 0.23051255072899443, "flos": 25520528805120.0, "grad_norm": 2.030934473686878, "language_loss": 0.74736786, "learning_rate": 3.592860451331624e-06, "loss": 0.7689904, "num_input_tokens_seen": 82488655, "step": 3834, "time_per_iteration": 2.719237804412842 }, { "auxiliary_loss_clip": 0.01104525, "auxiliary_loss_mlp": 0.01064338, "balance_loss_clip": 1.04610491, "balance_loss_mlp": 1.043679, "epoch": 0.2305726739816624, "flos": 21215198695680.0, "grad_norm": 1.9050082770497696, "language_loss": 0.86071098, "learning_rate": 3.592624901801432e-06, "loss": 0.88239956, "num_input_tokens_seen": 82507220, "step": 3835, "time_per_iteration": 2.627782106399536 }, { "auxiliary_loss_clip": 0.01115977, "auxiliary_loss_mlp": 0.01060727, "balance_loss_clip": 1.04934275, "balance_loss_mlp": 1.03979373, "epoch": 0.2306327972343304, "flos": 23331127115520.0, "grad_norm": 2.798777841757382, "language_loss": 0.82434011, "learning_rate": 3.5923892918787432e-06, "loss": 0.84610713, "num_input_tokens_seen": 82527920, "step": 3836, "time_per_iteration": 2.6091606616973877 }, { "auxiliary_loss_clip": 0.01144536, "auxiliary_loss_mlp": 0.0105466, "balance_loss_clip": 1.06090033, "balance_loss_mlp": 1.03683817, "epoch": 0.23069292048699835, "flos": 20666734951680.0, "grad_norm": 1.7189193248017045, "language_loss": 0.79633009, "learning_rate": 3.5921536215724934e-06, "loss": 0.81832206, "num_input_tokens_seen": 82549040, "step": 3837, "time_per_iteration": 2.535435914993286 }, { "auxiliary_loss_clip": 0.01057695, "auxiliary_loss_mlp": 0.01033541, "balance_loss_clip": 1.04840386, "balance_loss_mlp": 1.03003633, "epoch": 0.23075304373966632, "flos": 70454832393600.0, "grad_norm": 0.9031703200773207, "language_loss": 0.65381849, "learning_rate": 3.5919178908916184e-06, "loss": 0.67473078, "num_input_tokens_seen": 82604070, "step": 3838, "time_per_iteration": 3.0868518352508545 }, { "auxiliary_loss_clip": 0.01138177, "auxiliary_loss_mlp": 0.01056497, "balance_loss_clip": 1.05361629, "balance_loss_mlp": 1.0395453, "epoch": 0.23081316699233428, "flos": 16617986668800.0, "grad_norm": 2.5143705705619097, "language_loss": 0.75403488, "learning_rate": 3.591682099845058e-06, "loss": 0.77598161, "num_input_tokens_seen": 82619665, "step": 3839, "time_per_iteration": 2.6391067504882812 }, { "auxiliary_loss_clip": 0.01125705, "auxiliary_loss_mlp": 0.01046933, "balance_loss_clip": 1.05447173, "balance_loss_mlp": 1.02882481, "epoch": 0.23087329024500225, "flos": 13298081253120.0, "grad_norm": 1.8684605740856612, "language_loss": 0.68962026, "learning_rate": 3.591446248441752e-06, "loss": 0.71134663, "num_input_tokens_seen": 82637530, "step": 3840, "time_per_iteration": 2.6295006275177 }, { "auxiliary_loss_clip": 0.01158019, "auxiliary_loss_mlp": 0.01046048, "balance_loss_clip": 1.05840647, "balance_loss_mlp": 1.026057, "epoch": 0.23093341349767021, "flos": 17785729820160.0, "grad_norm": 2.5615469809997697, "language_loss": 0.80033958, "learning_rate": 3.591210336690645e-06, "loss": 0.8223803, "num_input_tokens_seen": 82656130, "step": 3841, "time_per_iteration": 2.6512410640716553 }, { "auxiliary_loss_clip": 0.01145317, "auxiliary_loss_mlp": 0.01047066, "balance_loss_clip": 1.05756617, "balance_loss_mlp": 1.0301621, "epoch": 0.23099353675033818, "flos": 23988076911360.0, "grad_norm": 1.7953422744525294, "language_loss": 0.83389241, "learning_rate": 3.590974364600683e-06, "loss": 0.85581625, "num_input_tokens_seen": 82675295, "step": 3842, "time_per_iteration": 2.7676117420196533 }, { "auxiliary_loss_clip": 0.01144752, "auxiliary_loss_mlp": 0.01044783, "balance_loss_clip": 1.05491304, "balance_loss_mlp": 1.02650845, "epoch": 0.23105366000300617, "flos": 35995168471680.0, "grad_norm": 1.8421697704365976, "language_loss": 0.66661239, "learning_rate": 3.5907383321808135e-06, "loss": 0.68850774, "num_input_tokens_seen": 82703260, "step": 3843, "time_per_iteration": 5.82958722114563 }, { "auxiliary_loss_clip": 0.01142299, "auxiliary_loss_mlp": 0.01047166, "balance_loss_clip": 1.05609, "balance_loss_mlp": 1.02914143, "epoch": 0.23111378325567414, "flos": 31245335556480.0, "grad_norm": 1.8996188882256444, "language_loss": 0.77221334, "learning_rate": 3.590502239439987e-06, "loss": 0.79410803, "num_input_tokens_seen": 82725060, "step": 3844, "time_per_iteration": 2.771226406097412 }, { "auxiliary_loss_clip": 0.01141796, "auxiliary_loss_mlp": 0.01045598, "balance_loss_clip": 1.05503309, "balance_loss_mlp": 1.02607179, "epoch": 0.2311739065083421, "flos": 19208223204480.0, "grad_norm": 1.9651801579729304, "language_loss": 0.78155982, "learning_rate": 3.590266086387156e-06, "loss": 0.80343372, "num_input_tokens_seen": 82742960, "step": 3845, "time_per_iteration": 4.247429370880127 }, { "auxiliary_loss_clip": 0.01117167, "auxiliary_loss_mlp": 0.01039426, "balance_loss_clip": 1.05274439, "balance_loss_mlp": 1.02292788, "epoch": 0.23123402976101007, "flos": 23360178240000.0, "grad_norm": 2.083958857623256, "language_loss": 0.76397669, "learning_rate": 3.590029873031276e-06, "loss": 0.78554261, "num_input_tokens_seen": 82760205, "step": 3846, "time_per_iteration": 2.7805917263031006 }, { "auxiliary_loss_clip": 0.01131462, "auxiliary_loss_mlp": 0.01049247, "balance_loss_clip": 1.05376291, "balance_loss_mlp": 1.03193808, "epoch": 0.23129415301367803, "flos": 13735365425280.0, "grad_norm": 1.8827740097117207, "language_loss": 0.70281041, "learning_rate": 3.589793599381304e-06, "loss": 0.72461748, "num_input_tokens_seen": 82778590, "step": 3847, "time_per_iteration": 2.6848642826080322 }, { "auxiliary_loss_clip": 0.01065475, "auxiliary_loss_mlp": 0.01006045, "balance_loss_clip": 1.04309821, "balance_loss_mlp": 1.00356507, "epoch": 0.231354276266346, "flos": 69737015001600.0, "grad_norm": 0.7955227467680892, "language_loss": 0.61006129, "learning_rate": 3.589557265446198e-06, "loss": 0.63077646, "num_input_tokens_seen": 82833925, "step": 3848, "time_per_iteration": 3.08832049369812 }, { "auxiliary_loss_clip": 0.01142916, "auxiliary_loss_mlp": 0.01044943, "balance_loss_clip": 1.05631924, "balance_loss_mlp": 1.02640557, "epoch": 0.231414399519014, "flos": 18835900778880.0, "grad_norm": 1.9602331138800266, "language_loss": 0.78082883, "learning_rate": 3.589320871234923e-06, "loss": 0.80270743, "num_input_tokens_seen": 82850625, "step": 3849, "time_per_iteration": 2.6830787658691406 }, { "auxiliary_loss_clip": 0.01137959, "auxiliary_loss_mlp": 0.01044864, "balance_loss_clip": 1.05184579, "balance_loss_mlp": 1.02630353, "epoch": 0.23147452277168196, "flos": 36135470995200.0, "grad_norm": 2.354271482082729, "language_loss": 0.71243513, "learning_rate": 3.5890844167564405e-06, "loss": 0.7342633, "num_input_tokens_seen": 82872105, "step": 3850, "time_per_iteration": 4.467762231826782 }, { "auxiliary_loss_clip": 0.01121609, "auxiliary_loss_mlp": 0.00776401, "balance_loss_clip": 1.05099773, "balance_loss_mlp": 1.00153255, "epoch": 0.23153464602434992, "flos": 20812927305600.0, "grad_norm": 4.184777043510671, "language_loss": 0.76577097, "learning_rate": 3.588847902019718e-06, "loss": 0.78475106, "num_input_tokens_seen": 82890595, "step": 3851, "time_per_iteration": 2.7452898025512695 }, { "auxiliary_loss_clip": 0.01152703, "auxiliary_loss_mlp": 0.01038649, "balance_loss_clip": 1.05650854, "balance_loss_mlp": 1.0206244, "epoch": 0.2315947692770179, "flos": 19939256801280.0, "grad_norm": 2.0528428588063914, "language_loss": 0.69642782, "learning_rate": 3.588611327033723e-06, "loss": 0.71834141, "num_input_tokens_seen": 82908910, "step": 3852, "time_per_iteration": 2.613687038421631 }, { "auxiliary_loss_clip": 0.0110964, "auxiliary_loss_mlp": 0.01050002, "balance_loss_clip": 1.05097961, "balance_loss_mlp": 1.0303328, "epoch": 0.23165489252968585, "flos": 12855553695360.0, "grad_norm": 2.8596642791724993, "language_loss": 0.67063856, "learning_rate": 3.588374691807428e-06, "loss": 0.69223493, "num_input_tokens_seen": 82925405, "step": 3853, "time_per_iteration": 2.6974282264709473 }, { "auxiliary_loss_clip": 0.01146149, "auxiliary_loss_mlp": 0.01041525, "balance_loss_clip": 1.05749798, "balance_loss_mlp": 1.02340484, "epoch": 0.23171501578235382, "flos": 30628282792320.0, "grad_norm": 1.7603397459637538, "language_loss": 0.80139267, "learning_rate": 3.5881379963498053e-06, "loss": 0.82326943, "num_input_tokens_seen": 82945615, "step": 3854, "time_per_iteration": 2.712125062942505 }, { "auxiliary_loss_clip": 0.01115767, "auxiliary_loss_mlp": 0.01052387, "balance_loss_clip": 1.04737794, "balance_loss_mlp": 1.03070331, "epoch": 0.23177513903502178, "flos": 23842782397440.0, "grad_norm": 1.9709775740629982, "language_loss": 0.65103847, "learning_rate": 3.587901240669831e-06, "loss": 0.67272007, "num_input_tokens_seen": 82967570, "step": 3855, "time_per_iteration": 2.718756675720215 }, { "auxiliary_loss_clip": 0.01153506, "auxiliary_loss_mlp": 0.01048508, "balance_loss_clip": 1.05417824, "balance_loss_mlp": 1.03050709, "epoch": 0.23183526228768978, "flos": 29570282668800.0, "grad_norm": 1.7803112411977504, "language_loss": 0.70386064, "learning_rate": 3.5876644247764815e-06, "loss": 0.7258808, "num_input_tokens_seen": 82987435, "step": 3856, "time_per_iteration": 2.798675060272217 }, { "auxiliary_loss_clip": 0.01103018, "auxiliary_loss_mlp": 0.01035927, "balance_loss_clip": 1.05080032, "balance_loss_mlp": 1.0200007, "epoch": 0.23189538554035774, "flos": 34458694254720.0, "grad_norm": 1.7837780829213195, "language_loss": 0.77101243, "learning_rate": 3.5874275486787387e-06, "loss": 0.79240191, "num_input_tokens_seen": 83010505, "step": 3857, "time_per_iteration": 2.8545501232147217 }, { "auxiliary_loss_clip": 0.01136868, "auxiliary_loss_mlp": 0.00777317, "balance_loss_clip": 1.0528996, "balance_loss_mlp": 1.00133562, "epoch": 0.2319555087930257, "flos": 18003815245440.0, "grad_norm": 2.445609387195472, "language_loss": 0.91629225, "learning_rate": 3.587190612385584e-06, "loss": 0.9354341, "num_input_tokens_seen": 83026705, "step": 3858, "time_per_iteration": 2.7018845081329346 }, { "auxiliary_loss_clip": 0.01095626, "auxiliary_loss_mlp": 0.01043975, "balance_loss_clip": 1.04882586, "balance_loss_mlp": 1.0263319, "epoch": 0.23201563204569367, "flos": 23143852581120.0, "grad_norm": 1.987074492721614, "language_loss": 0.76833785, "learning_rate": 3.5869536159060026e-06, "loss": 0.78973383, "num_input_tokens_seen": 83046500, "step": 3859, "time_per_iteration": 2.7465155124664307 }, { "auxiliary_loss_clip": 0.01136816, "auxiliary_loss_mlp": 0.01041128, "balance_loss_clip": 1.05060959, "balance_loss_mlp": 1.02316284, "epoch": 0.23207575529836164, "flos": 20667991927680.0, "grad_norm": 1.7166447387893018, "language_loss": 0.84341264, "learning_rate": 3.58671655924898e-06, "loss": 0.86519206, "num_input_tokens_seen": 83065280, "step": 3860, "time_per_iteration": 2.6602063179016113 }, { "auxiliary_loss_clip": 0.01091436, "auxiliary_loss_mlp": 0.01044571, "balance_loss_clip": 1.04641938, "balance_loss_mlp": 1.02640343, "epoch": 0.2321358785510296, "flos": 16472189364480.0, "grad_norm": 2.014536853896284, "language_loss": 0.83431923, "learning_rate": 3.586479442423508e-06, "loss": 0.85567933, "num_input_tokens_seen": 83082310, "step": 3861, "time_per_iteration": 2.728750228881836 }, { "auxiliary_loss_clip": 0.01130655, "auxiliary_loss_mlp": 0.00776368, "balance_loss_clip": 1.05122983, "balance_loss_mlp": 1.00149858, "epoch": 0.2321960018036976, "flos": 21616320850560.0, "grad_norm": 1.8874922149770945, "language_loss": 0.85921204, "learning_rate": 3.586242265438576e-06, "loss": 0.87828225, "num_input_tokens_seen": 83102065, "step": 3862, "time_per_iteration": 2.7289161682128906 }, { "auxiliary_loss_clip": 0.01112788, "auxiliary_loss_mlp": 0.0104236, "balance_loss_clip": 1.04956031, "balance_loss_mlp": 1.02645802, "epoch": 0.23225612505636556, "flos": 22271474966400.0, "grad_norm": 1.4078274786009342, "language_loss": 0.75131166, "learning_rate": 3.5860050283031773e-06, "loss": 0.77286315, "num_input_tokens_seen": 83121445, "step": 3863, "time_per_iteration": 2.7308037281036377 }, { "auxiliary_loss_clip": 0.01109911, "auxiliary_loss_mlp": 0.0104503, "balance_loss_clip": 1.05320251, "balance_loss_mlp": 1.02840066, "epoch": 0.23231624830903352, "flos": 17052325925760.0, "grad_norm": 1.8195520841096788, "language_loss": 0.74952984, "learning_rate": 3.58576773102631e-06, "loss": 0.77107918, "num_input_tokens_seen": 83138175, "step": 3864, "time_per_iteration": 2.669403314590454 }, { "auxiliary_loss_clip": 0.01148697, "auxiliary_loss_mlp": 0.01038596, "balance_loss_clip": 1.05258274, "balance_loss_mlp": 1.02182317, "epoch": 0.2323763715617015, "flos": 34640043045120.0, "grad_norm": 1.757817857347048, "language_loss": 0.70438093, "learning_rate": 3.5855303736169714e-06, "loss": 0.72625393, "num_input_tokens_seen": 83161975, "step": 3865, "time_per_iteration": 2.766399621963501 }, { "auxiliary_loss_clip": 0.01156124, "auxiliary_loss_mlp": 0.01048904, "balance_loss_clip": 1.05352104, "balance_loss_mlp": 1.02978325, "epoch": 0.23243649481436945, "flos": 25551698832000.0, "grad_norm": 1.8965816841290546, "language_loss": 0.94702542, "learning_rate": 3.5852929560841617e-06, "loss": 0.96907574, "num_input_tokens_seen": 83180905, "step": 3866, "time_per_iteration": 2.659867525100708 }, { "auxiliary_loss_clip": 0.01131283, "auxiliary_loss_mlp": 0.01044032, "balance_loss_clip": 1.04904807, "balance_loss_mlp": 1.02683008, "epoch": 0.23249661806703742, "flos": 20483482740480.0, "grad_norm": 4.181849364953483, "language_loss": 0.73026884, "learning_rate": 3.5850554784368846e-06, "loss": 0.75202191, "num_input_tokens_seen": 83196390, "step": 3867, "time_per_iteration": 2.645481586456299 }, { "auxiliary_loss_clip": 0.0112954, "auxiliary_loss_mlp": 0.01046355, "balance_loss_clip": 1.05079126, "balance_loss_mlp": 1.02855754, "epoch": 0.23255674131970538, "flos": 20376612800640.0, "grad_norm": 1.9671041323983256, "language_loss": 0.82770872, "learning_rate": 3.584817940684145e-06, "loss": 0.84946775, "num_input_tokens_seen": 83216165, "step": 3868, "time_per_iteration": 2.7670326232910156 }, { "auxiliary_loss_clip": 0.01125563, "auxiliary_loss_mlp": 0.01043558, "balance_loss_clip": 1.04875207, "balance_loss_mlp": 1.02648687, "epoch": 0.23261686457237338, "flos": 17056096853760.0, "grad_norm": 2.1100994183362967, "language_loss": 0.72952414, "learning_rate": 3.58458034283495e-06, "loss": 0.75121534, "num_input_tokens_seen": 83233845, "step": 3869, "time_per_iteration": 2.6661763191223145 }, { "auxiliary_loss_clip": 0.01132223, "auxiliary_loss_mlp": 0.0105087, "balance_loss_clip": 1.05129242, "balance_loss_mlp": 1.03382349, "epoch": 0.23267698782504134, "flos": 29169878785920.0, "grad_norm": 2.500604422715561, "language_loss": 0.79142725, "learning_rate": 3.5843426848983097e-06, "loss": 0.81325811, "num_input_tokens_seen": 83254930, "step": 3870, "time_per_iteration": 2.707321882247925 }, { "auxiliary_loss_clip": 0.01152434, "auxiliary_loss_mlp": 0.01046711, "balance_loss_clip": 1.05334866, "balance_loss_mlp": 1.02924728, "epoch": 0.2327371110777093, "flos": 21174655219200.0, "grad_norm": 2.176894576680098, "language_loss": 0.70915782, "learning_rate": 3.5841049668832357e-06, "loss": 0.73114932, "num_input_tokens_seen": 83272095, "step": 3871, "time_per_iteration": 2.6389646530151367 }, { "auxiliary_loss_clip": 0.01139847, "auxiliary_loss_mlp": 0.01051541, "balance_loss_clip": 1.05543458, "balance_loss_mlp": 1.03244328, "epoch": 0.23279723433037727, "flos": 24863112132480.0, "grad_norm": 1.8306984701748774, "language_loss": 0.68877381, "learning_rate": 3.5838671887987433e-06, "loss": 0.71068764, "num_input_tokens_seen": 83290980, "step": 3872, "time_per_iteration": 2.662309408187866 }, { "auxiliary_loss_clip": 0.0114472, "auxiliary_loss_mlp": 0.01042459, "balance_loss_clip": 1.05313611, "balance_loss_mlp": 1.02388597, "epoch": 0.23285735758304524, "flos": 38800617344640.0, "grad_norm": 1.5710106481349988, "language_loss": 0.779724, "learning_rate": 3.5836293506538474e-06, "loss": 0.80159569, "num_input_tokens_seen": 83315175, "step": 3873, "time_per_iteration": 2.884542942047119 }, { "auxiliary_loss_clip": 0.01053683, "auxiliary_loss_mlp": 0.01022765, "balance_loss_clip": 1.03691578, "balance_loss_mlp": 1.02038097, "epoch": 0.2329174808357132, "flos": 53944113692160.0, "grad_norm": 0.8561383552409444, "language_loss": 0.6051712, "learning_rate": 3.5833914524575687e-06, "loss": 0.62593567, "num_input_tokens_seen": 83372060, "step": 3874, "time_per_iteration": 3.165809392929077 }, { "auxiliary_loss_clip": 0.0112779, "auxiliary_loss_mlp": 0.01040869, "balance_loss_clip": 1.05157447, "balance_loss_mlp": 1.02328515, "epoch": 0.23297760408838117, "flos": 21216024708480.0, "grad_norm": 2.5039775977564522, "language_loss": 0.80842507, "learning_rate": 3.583153494218927e-06, "loss": 0.83011162, "num_input_tokens_seen": 83389795, "step": 3875, "time_per_iteration": 2.673657178878784 }, { "auxiliary_loss_clip": 0.01147803, "auxiliary_loss_mlp": 0.00774568, "balance_loss_clip": 1.05367982, "balance_loss_mlp": 1.00145388, "epoch": 0.23303772734104916, "flos": 28403006394240.0, "grad_norm": 4.3174446976030465, "language_loss": 0.6123395, "learning_rate": 3.5829154759469464e-06, "loss": 0.63156319, "num_input_tokens_seen": 83410005, "step": 3876, "time_per_iteration": 2.6973021030426025 }, { "auxiliary_loss_clip": 0.01116571, "auxiliary_loss_mlp": 0.01051971, "balance_loss_clip": 1.05002618, "balance_loss_mlp": 1.03345811, "epoch": 0.23309785059371713, "flos": 24314720215680.0, "grad_norm": 2.4263361529850447, "language_loss": 0.70649457, "learning_rate": 3.5826773976506523e-06, "loss": 0.72817999, "num_input_tokens_seen": 83430250, "step": 3877, "time_per_iteration": 2.7506351470947266 }, { "auxiliary_loss_clip": 0.01143537, "auxiliary_loss_mlp": 0.01051311, "balance_loss_clip": 1.05495286, "balance_loss_mlp": 1.03245187, "epoch": 0.2331579738463851, "flos": 15992925171840.0, "grad_norm": 2.202899784913125, "language_loss": 0.80724835, "learning_rate": 3.582439259339073e-06, "loss": 0.82919687, "num_input_tokens_seen": 83447950, "step": 3878, "time_per_iteration": 2.6945395469665527 }, { "auxiliary_loss_clip": 0.0109123, "auxiliary_loss_mlp": 0.01049547, "balance_loss_clip": 1.04632592, "balance_loss_mlp": 1.0298301, "epoch": 0.23321809709905306, "flos": 36426957863040.0, "grad_norm": 1.857420507716431, "language_loss": 0.7521472, "learning_rate": 3.5822010610212374e-06, "loss": 0.77355498, "num_input_tokens_seen": 83467785, "step": 3879, "time_per_iteration": 2.8909342288970947 }, { "auxiliary_loss_clip": 0.01095967, "auxiliary_loss_mlp": 0.01051433, "balance_loss_clip": 1.04621899, "balance_loss_mlp": 1.03238297, "epoch": 0.23327822035172102, "flos": 21324762155520.0, "grad_norm": 2.179587653719585, "language_loss": 0.89532614, "learning_rate": 3.5819628027061795e-06, "loss": 0.91680014, "num_input_tokens_seen": 83485390, "step": 3880, "time_per_iteration": 2.7358896732330322 }, { "auxiliary_loss_clip": 0.01127816, "auxiliary_loss_mlp": 0.01049697, "balance_loss_clip": 1.05119944, "balance_loss_mlp": 1.0319109, "epoch": 0.233338343604389, "flos": 19171881619200.0, "grad_norm": 1.6825190155617658, "language_loss": 0.71915156, "learning_rate": 3.5817244844029334e-06, "loss": 0.74092674, "num_input_tokens_seen": 83504890, "step": 3881, "time_per_iteration": 2.702533721923828 }, { "auxiliary_loss_clip": 0.01148084, "auxiliary_loss_mlp": 0.0104282, "balance_loss_clip": 1.05186546, "balance_loss_mlp": 1.02497458, "epoch": 0.23339846685705698, "flos": 26908368543360.0, "grad_norm": 1.5464986217430505, "language_loss": 0.68210357, "learning_rate": 3.581486106120537e-06, "loss": 0.70401263, "num_input_tokens_seen": 83526475, "step": 3882, "time_per_iteration": 2.6449384689331055 }, { "auxiliary_loss_clip": 0.01106984, "auxiliary_loss_mlp": 0.01053219, "balance_loss_clip": 1.04567862, "balance_loss_mlp": 1.03457499, "epoch": 0.23345859010972494, "flos": 32343160884480.0, "grad_norm": 2.180831821464153, "language_loss": 0.77379489, "learning_rate": 3.5812476678680287e-06, "loss": 0.79539698, "num_input_tokens_seen": 83546620, "step": 3883, "time_per_iteration": 5.806958913803101 }, { "auxiliary_loss_clip": 0.01053192, "auxiliary_loss_mlp": 0.01007679, "balance_loss_clip": 1.03368068, "balance_loss_mlp": 1.0053544, "epoch": 0.2335187133623929, "flos": 58484229050880.0, "grad_norm": 0.7945750769740417, "language_loss": 0.59117424, "learning_rate": 3.58100916965445e-06, "loss": 0.61178291, "num_input_tokens_seen": 83616160, "step": 3884, "time_per_iteration": 3.3524324893951416 }, { "auxiliary_loss_clip": 0.01117007, "auxiliary_loss_mlp": 0.01034005, "balance_loss_clip": 1.04925692, "balance_loss_mlp": 1.01704168, "epoch": 0.23357883661506088, "flos": 24502317972480.0, "grad_norm": 1.6775563031527567, "language_loss": 0.80286831, "learning_rate": 3.5807706114888455e-06, "loss": 0.82437843, "num_input_tokens_seen": 83636795, "step": 3885, "time_per_iteration": 4.295818328857422 }, { "auxiliary_loss_clip": 0.01136024, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.05494285, "balance_loss_mlp": 1.02274597, "epoch": 0.23363895986772884, "flos": 18948516894720.0, "grad_norm": 2.2066793657203116, "language_loss": 0.88230193, "learning_rate": 3.580531993380261e-06, "loss": 0.90406859, "num_input_tokens_seen": 83654050, "step": 3886, "time_per_iteration": 2.6672091484069824 }, { "auxiliary_loss_clip": 0.01150675, "auxiliary_loss_mlp": 0.01042457, "balance_loss_clip": 1.05293703, "balance_loss_mlp": 1.02512443, "epoch": 0.2336990831203968, "flos": 31686821619840.0, "grad_norm": 4.0082984179074055, "language_loss": 0.73170543, "learning_rate": 3.5802933153377445e-06, "loss": 0.75363672, "num_input_tokens_seen": 83673720, "step": 3887, "time_per_iteration": 2.7338294982910156 }, { "auxiliary_loss_clip": 0.01140271, "auxiliary_loss_mlp": 0.0104923, "balance_loss_clip": 1.05201173, "balance_loss_mlp": 1.03183722, "epoch": 0.23375920637306477, "flos": 27709750926720.0, "grad_norm": 2.677865426107907, "language_loss": 0.84125429, "learning_rate": 3.5800545773703475e-06, "loss": 0.86314929, "num_input_tokens_seen": 83693470, "step": 3888, "time_per_iteration": 2.7020208835601807 }, { "auxiliary_loss_clip": 0.01121847, "auxiliary_loss_mlp": 0.010605, "balance_loss_clip": 1.04974008, "balance_loss_mlp": 1.04121208, "epoch": 0.23381932962573276, "flos": 17675627656320.0, "grad_norm": 3.2074942430893976, "language_loss": 0.87298381, "learning_rate": 3.5798157794871225e-06, "loss": 0.89480728, "num_input_tokens_seen": 83711620, "step": 3889, "time_per_iteration": 4.319674491882324 }, { "auxiliary_loss_clip": 0.01141703, "auxiliary_loss_mlp": 0.01046248, "balance_loss_clip": 1.05330396, "balance_loss_mlp": 1.02877164, "epoch": 0.23387945287840073, "flos": 14390842763520.0, "grad_norm": 3.8719217250511164, "language_loss": 0.76830876, "learning_rate": 3.579576921697125e-06, "loss": 0.79018819, "num_input_tokens_seen": 83727890, "step": 3890, "time_per_iteration": 2.6133198738098145 }, { "auxiliary_loss_clip": 0.01107139, "auxiliary_loss_mlp": 0.00775386, "balance_loss_clip": 1.04837406, "balance_loss_mlp": 1.00124502, "epoch": 0.2339395761310687, "flos": 46097988503040.0, "grad_norm": 1.8304579433009527, "language_loss": 0.73385048, "learning_rate": 3.579338004009412e-06, "loss": 0.75267571, "num_input_tokens_seen": 83749370, "step": 3891, "time_per_iteration": 3.008927583694458 }, { "auxiliary_loss_clip": 0.01145053, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.05121398, "balance_loss_mlp": 1.03035665, "epoch": 0.23399969938373666, "flos": 22382044007040.0, "grad_norm": 1.8316289897122906, "language_loss": 0.82725632, "learning_rate": 3.5790990264330433e-06, "loss": 0.84918392, "num_input_tokens_seen": 83769560, "step": 3892, "time_per_iteration": 2.6455893516540527 }, { "auxiliary_loss_clip": 0.01100914, "auxiliary_loss_mlp": 0.01055558, "balance_loss_clip": 1.04450488, "balance_loss_mlp": 1.03491104, "epoch": 0.23405982263640462, "flos": 43508542066560.0, "grad_norm": 2.707564715226966, "language_loss": 0.64982933, "learning_rate": 3.578859988977082e-06, "loss": 0.67139405, "num_input_tokens_seen": 83795635, "step": 3893, "time_per_iteration": 2.9392964839935303 }, { "auxiliary_loss_clip": 0.01106007, "auxiliary_loss_mlp": 0.01045218, "balance_loss_clip": 1.04782617, "balance_loss_mlp": 1.02701449, "epoch": 0.2341199458890726, "flos": 22564685687040.0, "grad_norm": 2.5782091790717105, "language_loss": 0.79415286, "learning_rate": 3.5786208916505916e-06, "loss": 0.81566513, "num_input_tokens_seen": 83814090, "step": 3894, "time_per_iteration": 2.839935541152954 }, { "auxiliary_loss_clip": 0.01134295, "auxiliary_loss_mlp": 0.01049748, "balance_loss_clip": 1.04747164, "balance_loss_mlp": 1.03253388, "epoch": 0.23418006914174055, "flos": 25633970933760.0, "grad_norm": 1.551347830991082, "language_loss": 0.81978422, "learning_rate": 3.5783817344626383e-06, "loss": 0.84162462, "num_input_tokens_seen": 83836870, "step": 3895, "time_per_iteration": 2.739955425262451 }, { "auxiliary_loss_clip": 0.01134592, "auxiliary_loss_mlp": 0.01052429, "balance_loss_clip": 1.04999852, "balance_loss_mlp": 1.03514385, "epoch": 0.23424019239440855, "flos": 13545936074880.0, "grad_norm": 1.8690411936118732, "language_loss": 0.80239451, "learning_rate": 3.578142517422292e-06, "loss": 0.82426476, "num_input_tokens_seen": 83853275, "step": 3896, "time_per_iteration": 2.681114435195923 }, { "auxiliary_loss_clip": 0.01125586, "auxiliary_loss_mlp": 0.01045792, "balance_loss_clip": 1.04685259, "balance_loss_mlp": 1.02779162, "epoch": 0.2343003156470765, "flos": 22419498913920.0, "grad_norm": 2.2492510100498087, "language_loss": 0.83249009, "learning_rate": 3.577903240538623e-06, "loss": 0.85420382, "num_input_tokens_seen": 83872340, "step": 3897, "time_per_iteration": 2.728916645050049 }, { "auxiliary_loss_clip": 0.01134669, "auxiliary_loss_mlp": 0.01058403, "balance_loss_clip": 1.04949594, "balance_loss_mlp": 1.04016376, "epoch": 0.23436043889974448, "flos": 14790815683200.0, "grad_norm": 1.5875861860902294, "language_loss": 0.78903484, "learning_rate": 3.577663903820705e-06, "loss": 0.81096554, "num_input_tokens_seen": 83888795, "step": 3898, "time_per_iteration": 2.6597952842712402 }, { "auxiliary_loss_clip": 0.01109182, "auxiliary_loss_mlp": 0.01055226, "balance_loss_clip": 1.04657888, "balance_loss_mlp": 1.03785777, "epoch": 0.23442056215241244, "flos": 22965700101120.0, "grad_norm": 1.9975380770167093, "language_loss": 0.73769581, "learning_rate": 3.577424507277614e-06, "loss": 0.75933987, "num_input_tokens_seen": 83906820, "step": 3899, "time_per_iteration": 2.7511518001556396 }, { "auxiliary_loss_clip": 0.01110646, "auxiliary_loss_mlp": 0.01053556, "balance_loss_clip": 1.04662895, "balance_loss_mlp": 1.03530502, "epoch": 0.2344806854050804, "flos": 23071887682560.0, "grad_norm": 2.822835219305806, "language_loss": 0.75323856, "learning_rate": 3.5771850509184277e-06, "loss": 0.77488053, "num_input_tokens_seen": 83926370, "step": 3900, "time_per_iteration": 2.7366316318511963 }, { "auxiliary_loss_clip": 0.01097598, "auxiliary_loss_mlp": 0.01047935, "balance_loss_clip": 1.04771769, "balance_loss_mlp": 1.03019702, "epoch": 0.23454080865774837, "flos": 16327074418560.0, "grad_norm": 1.7042292639984586, "language_loss": 0.67123592, "learning_rate": 3.5769455347522256e-06, "loss": 0.69269133, "num_input_tokens_seen": 83944600, "step": 3901, "time_per_iteration": 2.857386589050293 }, { "auxiliary_loss_clip": 0.01029196, "auxiliary_loss_mlp": 0.01060621, "balance_loss_clip": 1.02959871, "balance_loss_mlp": 1.0584631, "epoch": 0.23460093191041637, "flos": 67760958142080.0, "grad_norm": 0.7708596717968548, "language_loss": 0.58189189, "learning_rate": 3.576705958788091e-06, "loss": 0.60279006, "num_input_tokens_seen": 84005100, "step": 3902, "time_per_iteration": 3.2769579887390137 }, { "auxiliary_loss_clip": 0.01126982, "auxiliary_loss_mlp": 0.01045748, "balance_loss_clip": 1.05044544, "balance_loss_mlp": 1.02691305, "epoch": 0.23466105516308433, "flos": 20077619990400.0, "grad_norm": 2.0309755154884708, "language_loss": 0.80396789, "learning_rate": 3.576466323035108e-06, "loss": 0.82569516, "num_input_tokens_seen": 84023775, "step": 3903, "time_per_iteration": 2.683908462524414 }, { "auxiliary_loss_clip": 0.01092072, "auxiliary_loss_mlp": 0.01044121, "balance_loss_clip": 1.04248238, "balance_loss_mlp": 1.02614391, "epoch": 0.2347211784157523, "flos": 24535714642560.0, "grad_norm": 1.970422818337997, "language_loss": 0.82400727, "learning_rate": 3.5762266275023645e-06, "loss": 0.84536922, "num_input_tokens_seen": 84042605, "step": 3904, "time_per_iteration": 2.8023037910461426 }, { "auxiliary_loss_clip": 0.01147463, "auxiliary_loss_mlp": 0.01043559, "balance_loss_clip": 1.05247784, "balance_loss_mlp": 1.02620173, "epoch": 0.23478130166842026, "flos": 23805040181760.0, "grad_norm": 1.9105311329606578, "language_loss": 0.71330345, "learning_rate": 3.57598687219895e-06, "loss": 0.73521364, "num_input_tokens_seen": 84061520, "step": 3905, "time_per_iteration": 2.650956869125366 }, { "auxiliary_loss_clip": 0.01143661, "auxiliary_loss_mlp": 0.01035514, "balance_loss_clip": 1.05086017, "balance_loss_mlp": 1.01877677, "epoch": 0.23484142492108823, "flos": 24093618048000.0, "grad_norm": 2.334164983860831, "language_loss": 0.71415532, "learning_rate": 3.5757470571339543e-06, "loss": 0.73594707, "num_input_tokens_seen": 84081800, "step": 3906, "time_per_iteration": 2.6635055541992188 }, { "auxiliary_loss_clip": 0.01138147, "auxiliary_loss_mlp": 0.01042098, "balance_loss_clip": 1.04703832, "balance_loss_mlp": 1.02246392, "epoch": 0.2349015481737562, "flos": 29095830898560.0, "grad_norm": 2.5527171953873693, "language_loss": 0.74024308, "learning_rate": 3.575507182316473e-06, "loss": 0.7620455, "num_input_tokens_seen": 84102340, "step": 3907, "time_per_iteration": 2.751154661178589 }, { "auxiliary_loss_clip": 0.01135101, "auxiliary_loss_mlp": 0.01047433, "balance_loss_clip": 1.04911268, "balance_loss_mlp": 1.02950394, "epoch": 0.23496167142642416, "flos": 18916305373440.0, "grad_norm": 1.9847054585906883, "language_loss": 0.72428519, "learning_rate": 3.575267247755601e-06, "loss": 0.74611056, "num_input_tokens_seen": 84120370, "step": 3908, "time_per_iteration": 2.631162166595459 }, { "auxiliary_loss_clip": 0.01053013, "auxiliary_loss_mlp": 0.01020478, "balance_loss_clip": 1.03362584, "balance_loss_mlp": 1.01765239, "epoch": 0.23502179467909215, "flos": 55868062896000.0, "grad_norm": 1.0307072678924762, "language_loss": 0.73359185, "learning_rate": 3.5750272534604367e-06, "loss": 0.75432676, "num_input_tokens_seen": 84165515, "step": 3909, "time_per_iteration": 2.974531650543213 }, { "auxiliary_loss_clip": 0.01136436, "auxiliary_loss_mlp": 0.01046445, "balance_loss_clip": 1.05006361, "balance_loss_mlp": 1.02797985, "epoch": 0.23508191793176011, "flos": 23401763210880.0, "grad_norm": 1.6771333047394956, "language_loss": 0.88288009, "learning_rate": 3.5747871994400822e-06, "loss": 0.90470886, "num_input_tokens_seen": 84184540, "step": 3910, "time_per_iteration": 2.6615123748779297 }, { "auxiliary_loss_clip": 0.01134757, "auxiliary_loss_mlp": 0.01038734, "balance_loss_clip": 1.04980493, "balance_loss_mlp": 1.02188933, "epoch": 0.23514204118442808, "flos": 20047671025920.0, "grad_norm": 1.9388895528834493, "language_loss": 0.76067305, "learning_rate": 3.5745470857036386e-06, "loss": 0.78240794, "num_input_tokens_seen": 84202025, "step": 3911, "time_per_iteration": 2.6846752166748047 }, { "auxiliary_loss_clip": 0.01130294, "auxiliary_loss_mlp": 0.01041364, "balance_loss_clip": 1.04968345, "balance_loss_mlp": 1.02546179, "epoch": 0.23520216443709605, "flos": 21580589796480.0, "grad_norm": 1.5851255377793763, "language_loss": 0.81651384, "learning_rate": 3.5743069122602122e-06, "loss": 0.83823043, "num_input_tokens_seen": 84221895, "step": 3912, "time_per_iteration": 2.6340627670288086 }, { "auxiliary_loss_clip": 0.01123815, "auxiliary_loss_mlp": 0.01046223, "balance_loss_clip": 1.05082059, "balance_loss_mlp": 1.02836537, "epoch": 0.235262287689764, "flos": 23185796688000.0, "grad_norm": 3.1390338867327165, "language_loss": 0.71748006, "learning_rate": 3.574066679118909e-06, "loss": 0.73918045, "num_input_tokens_seen": 84240455, "step": 3913, "time_per_iteration": 2.6716067790985107 }, { "auxiliary_loss_clip": 0.01141007, "auxiliary_loss_mlp": 0.00776535, "balance_loss_clip": 1.05018401, "balance_loss_mlp": 1.00136077, "epoch": 0.23532241094243198, "flos": 23185222070400.0, "grad_norm": 1.7080087282408476, "language_loss": 0.76152158, "learning_rate": 3.57382638628884e-06, "loss": 0.78069693, "num_input_tokens_seen": 84261605, "step": 3914, "time_per_iteration": 2.706982135772705 }, { "auxiliary_loss_clip": 0.01088532, "auxiliary_loss_mlp": 0.01039819, "balance_loss_clip": 1.0485754, "balance_loss_mlp": 1.02153206, "epoch": 0.23538253419509997, "flos": 17019324305280.0, "grad_norm": 2.2148128973951877, "language_loss": 0.89692557, "learning_rate": 3.5735860337791174e-06, "loss": 0.91820902, "num_input_tokens_seen": 84278675, "step": 3915, "time_per_iteration": 2.8005998134613037 }, { "auxiliary_loss_clip": 0.01045613, "auxiliary_loss_mlp": 0.0100868, "balance_loss_clip": 1.02860212, "balance_loss_mlp": 1.00596201, "epoch": 0.23544265744776793, "flos": 63448588967040.0, "grad_norm": 0.8066012642326402, "language_loss": 0.59382623, "learning_rate": 3.573345621598854e-06, "loss": 0.61436915, "num_input_tokens_seen": 84329765, "step": 3916, "time_per_iteration": 3.168708086013794 }, { "auxiliary_loss_clip": 0.01027738, "auxiliary_loss_mlp": 0.01005192, "balance_loss_clip": 1.03619492, "balance_loss_mlp": 1.00231957, "epoch": 0.2355027807004359, "flos": 70515343831680.0, "grad_norm": 0.7680467252570666, "language_loss": 0.49518228, "learning_rate": 3.5731051497571675e-06, "loss": 0.51551157, "num_input_tokens_seen": 84393680, "step": 3917, "time_per_iteration": 3.3240060806274414 }, { "auxiliary_loss_clip": 0.01112941, "auxiliary_loss_mlp": 0.01048231, "balance_loss_clip": 1.04929173, "balance_loss_mlp": 1.03133857, "epoch": 0.23556290395310386, "flos": 21434289701760.0, "grad_norm": 1.9721662885337694, "language_loss": 0.76349282, "learning_rate": 3.5728646182631756e-06, "loss": 0.78510457, "num_input_tokens_seen": 84412640, "step": 3918, "time_per_iteration": 2.739431619644165 }, { "auxiliary_loss_clip": 0.0109904, "auxiliary_loss_mlp": 0.01052049, "balance_loss_clip": 1.04440236, "balance_loss_mlp": 1.03514528, "epoch": 0.23562302720577183, "flos": 18186421011840.0, "grad_norm": 2.001330675769641, "language_loss": 0.69002521, "learning_rate": 3.5726240271259995e-06, "loss": 0.71153617, "num_input_tokens_seen": 84431605, "step": 3919, "time_per_iteration": 2.8809926509857178 }, { "auxiliary_loss_clip": 0.01106851, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.04772878, "balance_loss_mlp": 1.02221501, "epoch": 0.2356831504584398, "flos": 33730497832320.0, "grad_norm": 1.6908780146896767, "language_loss": 0.70500779, "learning_rate": 3.5723833763547634e-06, "loss": 0.72646987, "num_input_tokens_seen": 84454210, "step": 3920, "time_per_iteration": 2.7984554767608643 }, { "auxiliary_loss_clip": 0.01124832, "auxiliary_loss_mlp": 0.01054073, "balance_loss_clip": 1.05141807, "balance_loss_mlp": 1.03756285, "epoch": 0.23574327371110776, "flos": 24932778560640.0, "grad_norm": 1.7460619151295316, "language_loss": 0.77363533, "learning_rate": 3.5721426659585916e-06, "loss": 0.7954244, "num_input_tokens_seen": 84475540, "step": 3921, "time_per_iteration": 2.8038690090179443 }, { "auxiliary_loss_clip": 0.01113499, "auxiliary_loss_mlp": 0.01043793, "balance_loss_clip": 1.05042887, "balance_loss_mlp": 1.02692485, "epoch": 0.23580339696377575, "flos": 17822107319040.0, "grad_norm": 2.2761735813493775, "language_loss": 0.74768102, "learning_rate": 3.571901895946612e-06, "loss": 0.76925397, "num_input_tokens_seen": 84494580, "step": 3922, "time_per_iteration": 5.741380929946899 }, { "auxiliary_loss_clip": 0.01116057, "auxiliary_loss_mlp": 0.01041318, "balance_loss_clip": 1.04831624, "balance_loss_mlp": 1.02577269, "epoch": 0.23586352021644372, "flos": 26286611097600.0, "grad_norm": 3.3386441952016868, "language_loss": 0.79846609, "learning_rate": 3.571661066327956e-06, "loss": 0.82003981, "num_input_tokens_seen": 84513850, "step": 3923, "time_per_iteration": 2.7889180183410645 }, { "auxiliary_loss_clip": 0.01089456, "auxiliary_loss_mlp": 0.0105728, "balance_loss_clip": 1.04471469, "balance_loss_mlp": 1.03935063, "epoch": 0.23592364346911168, "flos": 14246697484800.0, "grad_norm": 4.698975622885271, "language_loss": 0.74874711, "learning_rate": 3.571420177111754e-06, "loss": 0.77021456, "num_input_tokens_seen": 84532315, "step": 3924, "time_per_iteration": 4.272740125656128 }, { "auxiliary_loss_clip": 0.01145554, "auxiliary_loss_mlp": 0.01046876, "balance_loss_clip": 1.05115998, "balance_loss_mlp": 1.030568, "epoch": 0.23598376672177965, "flos": 18587938216320.0, "grad_norm": 2.8676741031402977, "language_loss": 0.82357788, "learning_rate": 3.5711792283071416e-06, "loss": 0.8455022, "num_input_tokens_seen": 84550970, "step": 3925, "time_per_iteration": 2.6825013160705566 }, { "auxiliary_loss_clip": 0.0112035, "auxiliary_loss_mlp": 0.01048071, "balance_loss_clip": 1.04567564, "balance_loss_mlp": 1.0315721, "epoch": 0.2360438899744476, "flos": 22675542036480.0, "grad_norm": 1.5755651433289561, "language_loss": 0.59533024, "learning_rate": 3.5709382199232564e-06, "loss": 0.61701441, "num_input_tokens_seen": 84571655, "step": 3926, "time_per_iteration": 2.6960842609405518 }, { "auxiliary_loss_clip": 0.01125496, "auxiliary_loss_mlp": 0.01046163, "balance_loss_clip": 1.04914129, "balance_loss_mlp": 1.0302484, "epoch": 0.23610401322711558, "flos": 29570139014400.0, "grad_norm": 2.4179456581838212, "language_loss": 0.7155292, "learning_rate": 3.570697151969235e-06, "loss": 0.7372458, "num_input_tokens_seen": 84593130, "step": 3927, "time_per_iteration": 2.786576986312866 }, { "auxiliary_loss_clip": 0.01120941, "auxiliary_loss_mlp": 0.01047009, "balance_loss_clip": 1.04764938, "balance_loss_mlp": 1.03125572, "epoch": 0.23616413647978354, "flos": 17858520731520.0, "grad_norm": 1.9380358164668718, "language_loss": 0.74792278, "learning_rate": 3.570456024454221e-06, "loss": 0.76960224, "num_input_tokens_seen": 84612410, "step": 3928, "time_per_iteration": 4.450765609741211 }, { "auxiliary_loss_clip": 0.01118656, "auxiliary_loss_mlp": 0.01047112, "balance_loss_clip": 1.04935324, "balance_loss_mlp": 1.02949333, "epoch": 0.23622425973245154, "flos": 11034847157760.0, "grad_norm": 4.3448767989564745, "language_loss": 0.81905198, "learning_rate": 3.5702148373873576e-06, "loss": 0.84070963, "num_input_tokens_seen": 84627610, "step": 3929, "time_per_iteration": 2.654085874557495 }, { "auxiliary_loss_clip": 0.01151721, "auxiliary_loss_mlp": 0.0105167, "balance_loss_clip": 1.05143714, "balance_loss_mlp": 1.03314447, "epoch": 0.2362843829851195, "flos": 23404061681280.0, "grad_norm": 3.048788180104446, "language_loss": 0.72323942, "learning_rate": 3.569973590777789e-06, "loss": 0.74527335, "num_input_tokens_seen": 84648415, "step": 3930, "time_per_iteration": 2.67429780960083 }, { "auxiliary_loss_clip": 0.01143652, "auxiliary_loss_mlp": 0.01036151, "balance_loss_clip": 1.04880345, "balance_loss_mlp": 1.01985574, "epoch": 0.23634450623778747, "flos": 39529855261440.0, "grad_norm": 2.7450987997323333, "language_loss": 0.74105632, "learning_rate": 3.569732284634665e-06, "loss": 0.76285434, "num_input_tokens_seen": 84670080, "step": 3931, "time_per_iteration": 2.8017847537994385 }, { "auxiliary_loss_clip": 0.01137617, "auxiliary_loss_mlp": 0.01046002, "balance_loss_clip": 1.05250037, "balance_loss_mlp": 1.02853799, "epoch": 0.23640462949045543, "flos": 24207167917440.0, "grad_norm": 2.2419024865888852, "language_loss": 0.8018778, "learning_rate": 3.569490918967136e-06, "loss": 0.82371396, "num_input_tokens_seen": 84686465, "step": 3932, "time_per_iteration": 2.6295793056488037 }, { "auxiliary_loss_clip": 0.01108498, "auxiliary_loss_mlp": 0.0104053, "balance_loss_clip": 1.04981244, "balance_loss_mlp": 1.02614117, "epoch": 0.2364647527431234, "flos": 26177622255360.0, "grad_norm": 2.247824561482015, "language_loss": 0.85683465, "learning_rate": 3.5692494937843537e-06, "loss": 0.87832487, "num_input_tokens_seen": 84708825, "step": 3933, "time_per_iteration": 2.7401201725006104 }, { "auxiliary_loss_clip": 0.01101933, "auxiliary_loss_mlp": 0.010512, "balance_loss_clip": 1.04680276, "balance_loss_mlp": 1.03112483, "epoch": 0.23652487599579136, "flos": 22637009721600.0, "grad_norm": 2.0287283132247547, "language_loss": 0.83179402, "learning_rate": 3.5690080090954727e-06, "loss": 0.85332537, "num_input_tokens_seen": 84726165, "step": 3934, "time_per_iteration": 2.8152921199798584 }, { "auxiliary_loss_clip": 0.01148508, "auxiliary_loss_mlp": 0.01042164, "balance_loss_clip": 1.05208373, "balance_loss_mlp": 1.02556968, "epoch": 0.23658499924845935, "flos": 21762261809280.0, "grad_norm": 1.8368151879100059, "language_loss": 0.78513408, "learning_rate": 3.5687664649096515e-06, "loss": 0.80704081, "num_input_tokens_seen": 84745815, "step": 3935, "time_per_iteration": 2.6769750118255615 }, { "auxiliary_loss_clip": 0.01134595, "auxiliary_loss_mlp": 0.01034926, "balance_loss_clip": 1.05270088, "balance_loss_mlp": 1.01891589, "epoch": 0.23664512250112732, "flos": 21798998444160.0, "grad_norm": 1.5615220666884744, "language_loss": 0.79614085, "learning_rate": 3.5685248612360487e-06, "loss": 0.81783605, "num_input_tokens_seen": 84765415, "step": 3936, "time_per_iteration": 2.7037193775177 }, { "auxiliary_loss_clip": 0.01126163, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.04967618, "balance_loss_mlp": 1.01779902, "epoch": 0.23670524575379528, "flos": 22637871648000.0, "grad_norm": 1.671201383656535, "language_loss": 0.7915628, "learning_rate": 3.568283198083826e-06, "loss": 0.81317174, "num_input_tokens_seen": 84787080, "step": 3937, "time_per_iteration": 2.7639834880828857 }, { "auxiliary_loss_clip": 0.01134519, "auxiliary_loss_mlp": 0.01038533, "balance_loss_clip": 1.05320358, "balance_loss_mlp": 1.02313685, "epoch": 0.23676536900646325, "flos": 16725000263040.0, "grad_norm": 1.8758026172480324, "language_loss": 0.85389286, "learning_rate": 3.568041475462147e-06, "loss": 0.8756234, "num_input_tokens_seen": 84805395, "step": 3938, "time_per_iteration": 2.6919057369232178 }, { "auxiliary_loss_clip": 0.01145522, "auxiliary_loss_mlp": 0.01047488, "balance_loss_clip": 1.05159402, "balance_loss_mlp": 1.03076303, "epoch": 0.23682549225913122, "flos": 11135611785600.0, "grad_norm": 4.660879571039018, "language_loss": 0.9365679, "learning_rate": 3.5677996933801785e-06, "loss": 0.958498, "num_input_tokens_seen": 84818090, "step": 3939, "time_per_iteration": 2.7249948978424072 }, { "auxiliary_loss_clip": 0.01149288, "auxiliary_loss_mlp": 0.01041833, "balance_loss_clip": 1.0512023, "balance_loss_mlp": 1.02463138, "epoch": 0.23688561551179918, "flos": 22559226819840.0, "grad_norm": 1.884439522765895, "language_loss": 0.82347792, "learning_rate": 3.567557851847088e-06, "loss": 0.84538913, "num_input_tokens_seen": 84837695, "step": 3940, "time_per_iteration": 2.666647434234619 }, { "auxiliary_loss_clip": 0.01128412, "auxiliary_loss_mlp": 0.00775407, "balance_loss_clip": 1.05063081, "balance_loss_mlp": 1.00109661, "epoch": 0.23694573876446715, "flos": 18514895909760.0, "grad_norm": 2.7155330970608214, "language_loss": 0.88959104, "learning_rate": 3.5673159508720464e-06, "loss": 0.90862918, "num_input_tokens_seen": 84854630, "step": 3941, "time_per_iteration": 2.6898627281188965 }, { "auxiliary_loss_clip": 0.01147095, "auxiliary_loss_mlp": 0.01040548, "balance_loss_clip": 1.04976177, "balance_loss_mlp": 1.0227741, "epoch": 0.23700586201713514, "flos": 15335723980800.0, "grad_norm": 2.436898535695529, "language_loss": 0.8484506, "learning_rate": 3.5670739904642274e-06, "loss": 0.870327, "num_input_tokens_seen": 84871805, "step": 3942, "time_per_iteration": 2.560166835784912 }, { "auxiliary_loss_clip": 0.01109105, "auxiliary_loss_mlp": 0.01042997, "balance_loss_clip": 1.04736543, "balance_loss_mlp": 1.02447248, "epoch": 0.2370659852698031, "flos": 23947605262080.0, "grad_norm": 1.9848651824816348, "language_loss": 0.81126499, "learning_rate": 3.5668319706328065e-06, "loss": 0.83278596, "num_input_tokens_seen": 84889815, "step": 3943, "time_per_iteration": 2.7389075756073 }, { "auxiliary_loss_clip": 0.01114013, "auxiliary_loss_mlp": 0.01044642, "balance_loss_clip": 1.0464983, "balance_loss_mlp": 1.02618814, "epoch": 0.23712610852247107, "flos": 15332527670400.0, "grad_norm": 2.1611381488400143, "language_loss": 0.67060351, "learning_rate": 3.566589891386959e-06, "loss": 0.69219005, "num_input_tokens_seen": 84904380, "step": 3944, "time_per_iteration": 2.6382999420166016 }, { "auxiliary_loss_clip": 0.01117531, "auxiliary_loss_mlp": 0.01038157, "balance_loss_clip": 1.04629564, "balance_loss_mlp": 1.02003753, "epoch": 0.23718623177513903, "flos": 19682567233920.0, "grad_norm": 1.9578725621632602, "language_loss": 0.75573617, "learning_rate": 3.566347752735866e-06, "loss": 0.77729309, "num_input_tokens_seen": 84922935, "step": 3945, "time_per_iteration": 2.678377628326416 }, { "auxiliary_loss_clip": 0.01128604, "auxiliary_loss_mlp": 0.01039043, "balance_loss_clip": 1.0493716, "balance_loss_mlp": 1.02255654, "epoch": 0.237246355027807, "flos": 24973322037120.0, "grad_norm": 1.4378865328543082, "language_loss": 0.63750178, "learning_rate": 3.5661055546887094e-06, "loss": 0.65917826, "num_input_tokens_seen": 84943685, "step": 3946, "time_per_iteration": 2.77178955078125 }, { "auxiliary_loss_clip": 0.01130702, "auxiliary_loss_mlp": 0.01036796, "balance_loss_clip": 1.0460459, "balance_loss_mlp": 1.0186162, "epoch": 0.23730647828047496, "flos": 15377416692480.0, "grad_norm": 2.53957699605931, "language_loss": 0.77666485, "learning_rate": 3.5658632972546734e-06, "loss": 0.79833984, "num_input_tokens_seen": 84959505, "step": 3947, "time_per_iteration": 2.65461802482605 }, { "auxiliary_loss_clip": 0.01145835, "auxiliary_loss_mlp": 0.01040502, "balance_loss_clip": 1.0566994, "balance_loss_mlp": 1.02299047, "epoch": 0.23736660153314296, "flos": 28150662372480.0, "grad_norm": 2.0053805098120123, "language_loss": 0.80706096, "learning_rate": 3.565620980442944e-06, "loss": 0.82892442, "num_input_tokens_seen": 84982130, "step": 3948, "time_per_iteration": 2.756716012954712 }, { "auxiliary_loss_clip": 0.01129664, "auxiliary_loss_mlp": 0.01044051, "balance_loss_clip": 1.05104828, "balance_loss_mlp": 1.02643192, "epoch": 0.23742672478581092, "flos": 22086570729600.0, "grad_norm": 2.5980612684471374, "language_loss": 0.80257607, "learning_rate": 3.5653786042627107e-06, "loss": 0.82431316, "num_input_tokens_seen": 85000640, "step": 3949, "time_per_iteration": 2.74457049369812 }, { "auxiliary_loss_clip": 0.0112363, "auxiliary_loss_mlp": 0.01038665, "balance_loss_clip": 1.04977036, "balance_loss_mlp": 1.02109337, "epoch": 0.2374868480384789, "flos": 19537093152000.0, "grad_norm": 2.0592081961125093, "language_loss": 0.73239946, "learning_rate": 3.565136168723163e-06, "loss": 0.75402236, "num_input_tokens_seen": 85018970, "step": 3950, "time_per_iteration": 2.650508165359497 }, { "auxiliary_loss_clip": 0.01145426, "auxiliary_loss_mlp": 0.01037947, "balance_loss_clip": 1.05055118, "balance_loss_mlp": 1.02204442, "epoch": 0.23754697129114685, "flos": 19422501788160.0, "grad_norm": 1.9969465766046124, "language_loss": 0.72794384, "learning_rate": 3.564893673833495e-06, "loss": 0.74977756, "num_input_tokens_seen": 85035905, "step": 3951, "time_per_iteration": 2.652399778366089 }, { "auxiliary_loss_clip": 0.01122477, "auxiliary_loss_mlp": 0.01039445, "balance_loss_clip": 1.05080223, "balance_loss_mlp": 1.0216229, "epoch": 0.23760709454381482, "flos": 19501002961920.0, "grad_norm": 3.398248459712791, "language_loss": 0.73703241, "learning_rate": 3.564651119602903e-06, "loss": 0.75865161, "num_input_tokens_seen": 85054560, "step": 3952, "time_per_iteration": 2.7522144317626953 }, { "auxiliary_loss_clip": 0.01100804, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.04366636, "balance_loss_mlp": 1.02566266, "epoch": 0.23766721779648278, "flos": 27636600879360.0, "grad_norm": 1.7524267936836437, "language_loss": 0.71314329, "learning_rate": 3.564408506040583e-06, "loss": 0.73457694, "num_input_tokens_seen": 85074425, "step": 3953, "time_per_iteration": 2.7846672534942627 }, { "auxiliary_loss_clip": 0.01151909, "auxiliary_loss_mlp": 0.01047443, "balance_loss_clip": 1.05282676, "balance_loss_mlp": 1.02854872, "epoch": 0.23772734104915075, "flos": 23404348990080.0, "grad_norm": 1.9722222736847754, "language_loss": 0.81792426, "learning_rate": 3.5641658331557356e-06, "loss": 0.83991784, "num_input_tokens_seen": 85092865, "step": 3954, "time_per_iteration": 2.6262643337249756 }, { "auxiliary_loss_clip": 0.01127802, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.05239391, "balance_loss_mlp": 1.02616453, "epoch": 0.23778746430181874, "flos": 15705496540800.0, "grad_norm": 2.2607510345904824, "language_loss": 0.66270143, "learning_rate": 3.5639231009575634e-06, "loss": 0.68442386, "num_input_tokens_seen": 85110175, "step": 3955, "time_per_iteration": 2.672151803970337 }, { "auxiliary_loss_clip": 0.01149182, "auxiliary_loss_mlp": 0.0104812, "balance_loss_clip": 1.05219805, "balance_loss_mlp": 1.03104961, "epoch": 0.2378475875544867, "flos": 19426452284160.0, "grad_norm": 1.4117933502593074, "language_loss": 0.83963013, "learning_rate": 3.5636803094552704e-06, "loss": 0.86160314, "num_input_tokens_seen": 85129925, "step": 3956, "time_per_iteration": 2.6483681201934814 }, { "auxiliary_loss_clip": 0.01103304, "auxiliary_loss_mlp": 0.01042938, "balance_loss_clip": 1.04726648, "balance_loss_mlp": 1.02556944, "epoch": 0.23790771080715467, "flos": 22268565964800.0, "grad_norm": 2.308539718278817, "language_loss": 0.8482393, "learning_rate": 3.5634374586580635e-06, "loss": 0.86970174, "num_input_tokens_seen": 85147755, "step": 3957, "time_per_iteration": 2.718961715698242 }, { "auxiliary_loss_clip": 0.01087747, "auxiliary_loss_mlp": 0.01039974, "balance_loss_clip": 1.04701853, "balance_loss_mlp": 1.02428651, "epoch": 0.23796783405982264, "flos": 20047311889920.0, "grad_norm": 2.068360920278316, "language_loss": 0.70373344, "learning_rate": 3.563194548575151e-06, "loss": 0.72501063, "num_input_tokens_seen": 85165270, "step": 3958, "time_per_iteration": 2.818115472793579 }, { "auxiliary_loss_clip": 0.01102632, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.04540312, "balance_loss_mlp": 1.02276158, "epoch": 0.2380279573124906, "flos": 14245943299200.0, "grad_norm": 2.474231994209954, "language_loss": 0.66273189, "learning_rate": 3.562951579215745e-06, "loss": 0.68417823, "num_input_tokens_seen": 85181555, "step": 3959, "time_per_iteration": 2.71085786819458 }, { "auxiliary_loss_clip": 0.01103257, "auxiliary_loss_mlp": 0.01044748, "balance_loss_clip": 1.04910731, "balance_loss_mlp": 1.02760553, "epoch": 0.23808808056515857, "flos": 21179180332800.0, "grad_norm": 1.922923950627842, "language_loss": 0.72140026, "learning_rate": 3.5627085505890586e-06, "loss": 0.74288028, "num_input_tokens_seen": 85199455, "step": 3960, "time_per_iteration": 2.724398612976074 }, { "auxiliary_loss_clip": 0.01065725, "auxiliary_loss_mlp": 0.01041352, "balance_loss_clip": 1.04778433, "balance_loss_mlp": 1.02385175, "epoch": 0.23814820381782653, "flos": 22528308188160.0, "grad_norm": 1.836282299199184, "language_loss": 0.74303818, "learning_rate": 3.562465462704307e-06, "loss": 0.76410902, "num_input_tokens_seen": 85219170, "step": 3961, "time_per_iteration": 4.592544794082642 }, { "auxiliary_loss_clip": 0.01149701, "auxiliary_loss_mlp": 0.010511, "balance_loss_clip": 1.05083704, "balance_loss_mlp": 1.0321815, "epoch": 0.23820832707049452, "flos": 22304332932480.0, "grad_norm": 1.6798300631958207, "language_loss": 0.6562922, "learning_rate": 3.5622223155707085e-06, "loss": 0.67830026, "num_input_tokens_seen": 85238480, "step": 3962, "time_per_iteration": 4.40812087059021 }, { "auxiliary_loss_clip": 0.01121684, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.04743505, "balance_loss_mlp": 1.02511263, "epoch": 0.2382684503231625, "flos": 24864225454080.0, "grad_norm": 1.838705722688445, "language_loss": 0.74284148, "learning_rate": 3.561979109197483e-06, "loss": 0.76448429, "num_input_tokens_seen": 85259180, "step": 3963, "time_per_iteration": 2.7173969745635986 }, { "auxiliary_loss_clip": 0.01120014, "auxiliary_loss_mlp": 0.01045721, "balance_loss_clip": 1.0530858, "balance_loss_mlp": 1.02756512, "epoch": 0.23832857357583045, "flos": 21871609787520.0, "grad_norm": 2.045875790034744, "language_loss": 0.77264321, "learning_rate": 3.5617358435938538e-06, "loss": 0.79430056, "num_input_tokens_seen": 85278550, "step": 3964, "time_per_iteration": 4.25124716758728 }, { "auxiliary_loss_clip": 0.01108604, "auxiliary_loss_mlp": 0.01048343, "balance_loss_clip": 1.04783297, "balance_loss_mlp": 1.03124809, "epoch": 0.23838869682849842, "flos": 21288061434240.0, "grad_norm": 2.3097885565999894, "language_loss": 0.71521109, "learning_rate": 3.561492518769045e-06, "loss": 0.73678052, "num_input_tokens_seen": 85297345, "step": 3965, "time_per_iteration": 2.757647752761841 }, { "auxiliary_loss_clip": 0.01115176, "auxiliary_loss_mlp": 0.01043319, "balance_loss_clip": 1.04632521, "balance_loss_mlp": 1.02647483, "epoch": 0.23844882008116638, "flos": 16180594755840.0, "grad_norm": 2.673966650516871, "language_loss": 0.78003007, "learning_rate": 3.561249134732282e-06, "loss": 0.801615, "num_input_tokens_seen": 85315105, "step": 3966, "time_per_iteration": 2.71159291267395 }, { "auxiliary_loss_clip": 0.01124693, "auxiliary_loss_mlp": 0.01045448, "balance_loss_clip": 1.05071902, "balance_loss_mlp": 1.02899134, "epoch": 0.23850894333383435, "flos": 21069724613760.0, "grad_norm": 2.116401462724705, "language_loss": 0.68767631, "learning_rate": 3.561005691492797e-06, "loss": 0.70937771, "num_input_tokens_seen": 85334735, "step": 3967, "time_per_iteration": 2.7072744369506836 }, { "auxiliary_loss_clip": 0.01116174, "auxiliary_loss_mlp": 0.01055757, "balance_loss_clip": 1.04883289, "balance_loss_mlp": 1.03803015, "epoch": 0.23856906658650234, "flos": 17201606849280.0, "grad_norm": 3.581336577718575, "language_loss": 0.68005061, "learning_rate": 3.5607621890598185e-06, "loss": 0.70176995, "num_input_tokens_seen": 85352875, "step": 3968, "time_per_iteration": 4.378219842910767 }, { "auxiliary_loss_clip": 0.01097883, "auxiliary_loss_mlp": 0.01044394, "balance_loss_clip": 1.05052614, "balance_loss_mlp": 1.0274837, "epoch": 0.2386291898391703, "flos": 29494223619840.0, "grad_norm": 2.210255088762028, "language_loss": 0.77106255, "learning_rate": 3.5605186274425823e-06, "loss": 0.79248536, "num_input_tokens_seen": 85372205, "step": 3969, "time_per_iteration": 2.847663164138794 }, { "auxiliary_loss_clip": 0.01121681, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.0498476, "balance_loss_mlp": 1.02334595, "epoch": 0.23868931309183827, "flos": 21142443697920.0, "grad_norm": 2.1326335149840583, "language_loss": 0.7617563, "learning_rate": 3.5602750066503225e-06, "loss": 0.78337121, "num_input_tokens_seen": 85389705, "step": 3970, "time_per_iteration": 2.766862392425537 }, { "auxiliary_loss_clip": 0.01106309, "auxiliary_loss_mlp": 0.01049131, "balance_loss_clip": 1.04287159, "balance_loss_mlp": 1.03111875, "epoch": 0.23874943634450624, "flos": 25659394784640.0, "grad_norm": 2.3319107764636415, "language_loss": 0.85474384, "learning_rate": 3.5600313266922793e-06, "loss": 0.87629819, "num_input_tokens_seen": 85407855, "step": 3971, "time_per_iteration": 2.7597670555114746 }, { "auxiliary_loss_clip": 0.01062507, "auxiliary_loss_mlp": 0.01039144, "balance_loss_clip": 1.03465796, "balance_loss_mlp": 1.03661716, "epoch": 0.2388095595971742, "flos": 58986618624000.0, "grad_norm": 0.7451796217314707, "language_loss": 0.62797832, "learning_rate": 3.5597875875776915e-06, "loss": 0.6489948, "num_input_tokens_seen": 85470885, "step": 3972, "time_per_iteration": 3.2572779655456543 }, { "auxiliary_loss_clip": 0.0112174, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.0492239, "balance_loss_mlp": 1.02109838, "epoch": 0.23886968284984217, "flos": 16800341040000.0, "grad_norm": 1.9449657433446057, "language_loss": 0.82093811, "learning_rate": 3.5595437893158013e-06, "loss": 0.84253484, "num_input_tokens_seen": 85488460, "step": 3973, "time_per_iteration": 2.6394145488739014 }, { "auxiliary_loss_clip": 0.01115852, "auxiliary_loss_mlp": 0.01050239, "balance_loss_clip": 1.04884124, "balance_loss_mlp": 1.03272736, "epoch": 0.23892980610251013, "flos": 22382654538240.0, "grad_norm": 1.5639820592628684, "language_loss": 0.79418832, "learning_rate": 3.5592999319158546e-06, "loss": 0.81584924, "num_input_tokens_seen": 85508590, "step": 3974, "time_per_iteration": 2.6926944255828857 }, { "auxiliary_loss_clip": 0.01134012, "auxiliary_loss_mlp": 0.01042703, "balance_loss_clip": 1.05169725, "balance_loss_mlp": 1.02475047, "epoch": 0.23898992935517813, "flos": 12823198519680.0, "grad_norm": 1.8382350241534648, "language_loss": 0.8420803, "learning_rate": 3.5590560153870984e-06, "loss": 0.86384743, "num_input_tokens_seen": 85525970, "step": 3975, "time_per_iteration": 2.6402463912963867 }, { "auxiliary_loss_clip": 0.01126962, "auxiliary_loss_mlp": 0.01042445, "balance_loss_clip": 1.04938245, "balance_loss_mlp": 1.02545786, "epoch": 0.2390500526078461, "flos": 22345666508160.0, "grad_norm": 2.129124681208868, "language_loss": 0.84249294, "learning_rate": 3.5588120397387816e-06, "loss": 0.864187, "num_input_tokens_seen": 85543700, "step": 3976, "time_per_iteration": 2.624758720397949 }, { "auxiliary_loss_clip": 0.01075224, "auxiliary_loss_mlp": 0.01036827, "balance_loss_clip": 1.0434798, "balance_loss_mlp": 1.02103186, "epoch": 0.23911017586051406, "flos": 22635142214400.0, "grad_norm": 1.8888081312271703, "language_loss": 0.74451673, "learning_rate": 3.5585680049801566e-06, "loss": 0.76563722, "num_input_tokens_seen": 85562765, "step": 3977, "time_per_iteration": 2.848529815673828 }, { "auxiliary_loss_clip": 0.01151335, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.05476987, "balance_loss_mlp": 1.02829063, "epoch": 0.23917029911318202, "flos": 23653281219840.0, "grad_norm": 1.6816446874821869, "language_loss": 0.72515011, "learning_rate": 3.5583239111204764e-06, "loss": 0.74712306, "num_input_tokens_seen": 85581755, "step": 3978, "time_per_iteration": 2.6967527866363525 }, { "auxiliary_loss_clip": 0.01123321, "auxiliary_loss_mlp": 0.01045192, "balance_loss_clip": 1.04713726, "balance_loss_mlp": 1.02802634, "epoch": 0.23923042236585, "flos": 22783597125120.0, "grad_norm": 2.5130493367739413, "language_loss": 0.78474021, "learning_rate": 3.558079758168997e-06, "loss": 0.80642533, "num_input_tokens_seen": 85599455, "step": 3979, "time_per_iteration": 2.6679623126983643 }, { "auxiliary_loss_clip": 0.01123187, "auxiliary_loss_mlp": 0.01052255, "balance_loss_clip": 1.04774463, "balance_loss_mlp": 1.03390861, "epoch": 0.23929054561851795, "flos": 28147717457280.0, "grad_norm": 1.8353092232149775, "language_loss": 0.81943917, "learning_rate": 3.557835546134977e-06, "loss": 0.84119362, "num_input_tokens_seen": 85619970, "step": 3980, "time_per_iteration": 2.7941136360168457 }, { "auxiliary_loss_clip": 0.01094849, "auxiliary_loss_mlp": 0.01037854, "balance_loss_clip": 1.04719615, "balance_loss_mlp": 1.02036595, "epoch": 0.23935066887118592, "flos": 21686525982720.0, "grad_norm": 1.7388406045293963, "language_loss": 0.83562148, "learning_rate": 3.5575912750276775e-06, "loss": 0.85694849, "num_input_tokens_seen": 85638850, "step": 3981, "time_per_iteration": 2.773372173309326 }, { "auxiliary_loss_clip": 0.01126579, "auxiliary_loss_mlp": 0.01045152, "balance_loss_clip": 1.05084574, "balance_loss_mlp": 1.0267818, "epoch": 0.2394107921238539, "flos": 32122274198400.0, "grad_norm": 2.0270942419393676, "language_loss": 0.76690662, "learning_rate": 3.5573469448563607e-06, "loss": 0.78862393, "num_input_tokens_seen": 85656285, "step": 3982, "time_per_iteration": 2.770089864730835 }, { "auxiliary_loss_clip": 0.01107786, "auxiliary_loss_mlp": 0.01043737, "balance_loss_clip": 1.04928303, "balance_loss_mlp": 1.02757215, "epoch": 0.23947091537652188, "flos": 17019180650880.0, "grad_norm": 2.333665248317953, "language_loss": 0.78243405, "learning_rate": 3.5571025556302915e-06, "loss": 0.80394924, "num_input_tokens_seen": 85673020, "step": 3983, "time_per_iteration": 2.8361902236938477 }, { "auxiliary_loss_clip": 0.01136012, "auxiliary_loss_mlp": 0.00775416, "balance_loss_clip": 1.0530262, "balance_loss_mlp": 1.00106907, "epoch": 0.23953103862918984, "flos": 20593584904320.0, "grad_norm": 1.8468424363822287, "language_loss": 0.73274761, "learning_rate": 3.556858107358737e-06, "loss": 0.75186193, "num_input_tokens_seen": 85692565, "step": 3984, "time_per_iteration": 2.720289468765259 }, { "auxiliary_loss_clip": 0.01102619, "auxiliary_loss_mlp": 0.01051209, "balance_loss_clip": 1.04748976, "balance_loss_mlp": 1.0330658, "epoch": 0.2395911618818578, "flos": 20704405340160.0, "grad_norm": 1.906378165207968, "language_loss": 0.79090226, "learning_rate": 3.5566136000509674e-06, "loss": 0.81244051, "num_input_tokens_seen": 85709730, "step": 3985, "time_per_iteration": 2.8464138507843018 }, { "auxiliary_loss_clip": 0.01102898, "auxiliary_loss_mlp": 0.01047238, "balance_loss_clip": 1.04676175, "balance_loss_mlp": 1.02930927, "epoch": 0.23965128513452577, "flos": 27053519402880.0, "grad_norm": 1.780185130038595, "language_loss": 0.73194253, "learning_rate": 3.556369033716254e-06, "loss": 0.7534439, "num_input_tokens_seen": 85730045, "step": 3986, "time_per_iteration": 2.873837471008301 }, { "auxiliary_loss_clip": 0.01143561, "auxiliary_loss_mlp": 0.01052533, "balance_loss_clip": 1.05392861, "balance_loss_mlp": 1.03523529, "epoch": 0.23971140838719374, "flos": 23144319457920.0, "grad_norm": 1.9275946084378768, "language_loss": 0.88014174, "learning_rate": 3.556124408363871e-06, "loss": 0.90210271, "num_input_tokens_seen": 85747590, "step": 3987, "time_per_iteration": 2.778970718383789 }, { "auxiliary_loss_clip": 0.01131181, "auxiliary_loss_mlp": 0.01037226, "balance_loss_clip": 1.05180991, "balance_loss_mlp": 1.02253985, "epoch": 0.23977153163986173, "flos": 18034554309120.0, "grad_norm": 8.94948058332038, "language_loss": 0.82985806, "learning_rate": 3.5558797240030945e-06, "loss": 0.85154212, "num_input_tokens_seen": 85763460, "step": 3988, "time_per_iteration": 2.6707162857055664 }, { "auxiliary_loss_clip": 0.01132219, "auxiliary_loss_mlp": 0.01039377, "balance_loss_clip": 1.04952908, "balance_loss_mlp": 1.02213907, "epoch": 0.2398316548925297, "flos": 18113378705280.0, "grad_norm": 1.6085860818119202, "language_loss": 0.85336304, "learning_rate": 3.5556349806432035e-06, "loss": 0.87507904, "num_input_tokens_seen": 85782050, "step": 3989, "time_per_iteration": 2.644075632095337 }, { "auxiliary_loss_clip": 0.01144734, "auxiliary_loss_mlp": 0.01039049, "balance_loss_clip": 1.05094743, "balance_loss_mlp": 1.02263403, "epoch": 0.23989177814519766, "flos": 12567730014720.0, "grad_norm": 1.981474679784042, "language_loss": 0.84109843, "learning_rate": 3.555390178293477e-06, "loss": 0.86293626, "num_input_tokens_seen": 85797400, "step": 3990, "time_per_iteration": 2.5778160095214844 }, { "auxiliary_loss_clip": 0.01131361, "auxiliary_loss_mlp": 0.01042102, "balance_loss_clip": 1.04863191, "balance_loss_mlp": 1.02565074, "epoch": 0.23995190139786562, "flos": 25264593423360.0, "grad_norm": 1.5352138463261382, "language_loss": 0.75853264, "learning_rate": 3.5551453169631994e-06, "loss": 0.78026724, "num_input_tokens_seen": 85818995, "step": 3991, "time_per_iteration": 2.7569639682769775 }, { "auxiliary_loss_clip": 0.01040828, "auxiliary_loss_mlp": 0.0100398, "balance_loss_clip": 1.02825403, "balance_loss_mlp": 1.00114298, "epoch": 0.2400120246505336, "flos": 61960379650560.0, "grad_norm": 0.8795356934357302, "language_loss": 0.63683558, "learning_rate": 3.554900396661656e-06, "loss": 0.65728366, "num_input_tokens_seen": 85876695, "step": 3992, "time_per_iteration": 3.2559213638305664 }, { "auxiliary_loss_clip": 0.01055123, "auxiliary_loss_mlp": 0.01005737, "balance_loss_clip": 1.02834392, "balance_loss_mlp": 1.00292385, "epoch": 0.24007214790320155, "flos": 66708560540160.0, "grad_norm": 0.7639831296699208, "language_loss": 0.6297875, "learning_rate": 3.5546554173981334e-06, "loss": 0.65039611, "num_input_tokens_seen": 85940990, "step": 3993, "time_per_iteration": 3.2946221828460693 }, { "auxiliary_loss_clip": 0.0110983, "auxiliary_loss_mlp": 0.01048609, "balance_loss_clip": 1.05077267, "balance_loss_mlp": 1.03078759, "epoch": 0.24013227115586952, "flos": 25809070757760.0, "grad_norm": 1.7227387633537015, "language_loss": 0.7656548, "learning_rate": 3.5544103791819218e-06, "loss": 0.78723919, "num_input_tokens_seen": 85961165, "step": 3994, "time_per_iteration": 2.7735466957092285 }, { "auxiliary_loss_clip": 0.01120115, "auxiliary_loss_mlp": 0.01051235, "balance_loss_clip": 1.04648936, "balance_loss_mlp": 1.0323168, "epoch": 0.2401923944085375, "flos": 25557480921600.0, "grad_norm": 1.7819538389347498, "language_loss": 0.78550023, "learning_rate": 3.5541652820223124e-06, "loss": 0.80721372, "num_input_tokens_seen": 85982710, "step": 3995, "time_per_iteration": 2.8184118270874023 }, { "auxiliary_loss_clip": 0.01034, "auxiliary_loss_mlp": 0.01026353, "balance_loss_clip": 1.02876425, "balance_loss_mlp": 1.0237658, "epoch": 0.24025251766120548, "flos": 54941138478720.0, "grad_norm": 0.9088717203971356, "language_loss": 0.6345036, "learning_rate": 3.5539201259286006e-06, "loss": 0.65510708, "num_input_tokens_seen": 86046935, "step": 3996, "time_per_iteration": 3.304704189300537 }, { "auxiliary_loss_clip": 0.01122635, "auxiliary_loss_mlp": 0.01046678, "balance_loss_clip": 1.04812241, "balance_loss_mlp": 1.02960706, "epoch": 0.24031264091387344, "flos": 20631075724800.0, "grad_norm": 2.5673853359086403, "language_loss": 0.69455099, "learning_rate": 3.5536749109100808e-06, "loss": 0.7162441, "num_input_tokens_seen": 86064355, "step": 3997, "time_per_iteration": 2.6638269424438477 }, { "auxiliary_loss_clip": 0.01136246, "auxiliary_loss_mlp": 0.01041204, "balance_loss_clip": 1.0500989, "balance_loss_mlp": 1.02390659, "epoch": 0.2403727641665414, "flos": 20886256920960.0, "grad_norm": 1.9944619018673675, "language_loss": 0.87352818, "learning_rate": 3.5534296369760535e-06, "loss": 0.89530265, "num_input_tokens_seen": 86081340, "step": 3998, "time_per_iteration": 2.6837756633758545 }, { "auxiliary_loss_clip": 0.01126262, "auxiliary_loss_mlp": 0.01038814, "balance_loss_clip": 1.04337883, "balance_loss_mlp": 1.02173114, "epoch": 0.24043288741920937, "flos": 22820046451200.0, "grad_norm": 1.5798261831400109, "language_loss": 0.75723118, "learning_rate": 3.5531843041358183e-06, "loss": 0.77888191, "num_input_tokens_seen": 86102260, "step": 3999, "time_per_iteration": 2.659717321395874 }, { "auxiliary_loss_clip": 0.01116532, "auxiliary_loss_mlp": 0.01049627, "balance_loss_clip": 1.04679537, "balance_loss_mlp": 1.03259242, "epoch": 0.24049301067187734, "flos": 27959652823680.0, "grad_norm": 2.380373207595884, "language_loss": 0.72602308, "learning_rate": 3.552938912398679e-06, "loss": 0.74768472, "num_input_tokens_seen": 86123400, "step": 4000, "time_per_iteration": 4.285717487335205 }, { "auxiliary_loss_clip": 0.01138397, "auxiliary_loss_mlp": 0.01040819, "balance_loss_clip": 1.05207551, "balance_loss_mlp": 1.02389169, "epoch": 0.24055313392454533, "flos": 27451409333760.0, "grad_norm": 2.3105318706157862, "language_loss": 0.67128104, "learning_rate": 3.5526934617739397e-06, "loss": 0.69307321, "num_input_tokens_seen": 86144060, "step": 4001, "time_per_iteration": 4.2180609703063965 }, { "auxiliary_loss_clip": 0.01144863, "auxiliary_loss_mlp": 0.01043304, "balance_loss_clip": 1.04859209, "balance_loss_mlp": 1.02525568, "epoch": 0.2406132571772133, "flos": 25556618995200.0, "grad_norm": 2.360624564793828, "language_loss": 0.82895994, "learning_rate": 3.5524479522709095e-06, "loss": 0.85084158, "num_input_tokens_seen": 86163005, "step": 4002, "time_per_iteration": 2.6369640827178955 }, { "auxiliary_loss_clip": 0.01106477, "auxiliary_loss_mlp": 0.01045072, "balance_loss_clip": 1.0493201, "balance_loss_mlp": 1.0283823, "epoch": 0.24067338042988126, "flos": 24791398629120.0, "grad_norm": 2.016027139567785, "language_loss": 0.83058953, "learning_rate": 3.552202383898897e-06, "loss": 0.85210502, "num_input_tokens_seen": 86182580, "step": 4003, "time_per_iteration": 4.312098979949951 }, { "auxiliary_loss_clip": 0.01114745, "auxiliary_loss_mlp": 0.01042117, "balance_loss_clip": 1.0474503, "balance_loss_mlp": 1.02458131, "epoch": 0.24073350368254923, "flos": 21177923356800.0, "grad_norm": 1.971328156333658, "language_loss": 0.8672772, "learning_rate": 3.551956756667215e-06, "loss": 0.8888458, "num_input_tokens_seen": 86200665, "step": 4004, "time_per_iteration": 2.646578311920166 }, { "auxiliary_loss_clip": 0.01115631, "auxiliary_loss_mlp": 0.01054344, "balance_loss_clip": 1.04529011, "balance_loss_mlp": 1.03736866, "epoch": 0.2407936269352172, "flos": 22494300986880.0, "grad_norm": 1.9965130860947515, "language_loss": 0.78239757, "learning_rate": 3.551711070585177e-06, "loss": 0.80409735, "num_input_tokens_seen": 86221640, "step": 4005, "time_per_iteration": 2.7220566272735596 }, { "auxiliary_loss_clip": 0.01090518, "auxiliary_loss_mlp": 0.01039515, "balance_loss_clip": 1.04414058, "balance_loss_mlp": 1.02164578, "epoch": 0.24085375018788516, "flos": 18551129754240.0, "grad_norm": 1.6390993289809686, "language_loss": 0.79391652, "learning_rate": 3.5514653256620995e-06, "loss": 0.8152169, "num_input_tokens_seen": 86240795, "step": 4006, "time_per_iteration": 2.7188642024993896 }, { "auxiliary_loss_clip": 0.01130191, "auxiliary_loss_mlp": 0.00777161, "balance_loss_clip": 1.0482645, "balance_loss_mlp": 1.00115335, "epoch": 0.24091387344055312, "flos": 24170539023360.0, "grad_norm": 1.6765272633695874, "language_loss": 0.71939242, "learning_rate": 3.551219521907302e-06, "loss": 0.73846585, "num_input_tokens_seen": 86262000, "step": 4007, "time_per_iteration": 4.3504638671875 }, { "auxiliary_loss_clip": 0.01101925, "auxiliary_loss_mlp": 0.01047677, "balance_loss_clip": 1.04589975, "balance_loss_mlp": 1.03132153, "epoch": 0.24097399669322112, "flos": 11036319615360.0, "grad_norm": 1.6891966370612705, "language_loss": 0.76460171, "learning_rate": 3.5509736593301042e-06, "loss": 0.78609765, "num_input_tokens_seen": 86279680, "step": 4008, "time_per_iteration": 2.700744152069092 }, { "auxiliary_loss_clip": 0.01136495, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.05069256, "balance_loss_mlp": 1.02192402, "epoch": 0.24103411994588908, "flos": 17165085696000.0, "grad_norm": 2.427830882471808, "language_loss": 0.74601823, "learning_rate": 3.5507277379398295e-06, "loss": 0.76777172, "num_input_tokens_seen": 86297180, "step": 4009, "time_per_iteration": 2.6175808906555176 }, { "auxiliary_loss_clip": 0.01134079, "auxiliary_loss_mlp": 0.01041957, "balance_loss_clip": 1.05032861, "balance_loss_mlp": 1.02532756, "epoch": 0.24109424319855705, "flos": 20667956014080.0, "grad_norm": 1.6643292794637636, "language_loss": 0.80064976, "learning_rate": 3.550481757745804e-06, "loss": 0.82241005, "num_input_tokens_seen": 86317660, "step": 4010, "time_per_iteration": 2.680511236190796 }, { "auxiliary_loss_clip": 0.01118599, "auxiliary_loss_mlp": 0.01047241, "balance_loss_clip": 1.04658401, "balance_loss_mlp": 1.02779818, "epoch": 0.241154366451225, "flos": 28181796485760.0, "grad_norm": 3.8737422865874245, "language_loss": 0.70889425, "learning_rate": 3.5502357187573555e-06, "loss": 0.73055267, "num_input_tokens_seen": 86338325, "step": 4011, "time_per_iteration": 2.716404676437378 }, { "auxiliary_loss_clip": 0.01065208, "auxiliary_loss_mlp": 0.01047099, "balance_loss_clip": 1.0414176, "balance_loss_mlp": 1.02802527, "epoch": 0.24121448970389298, "flos": 21689722293120.0, "grad_norm": 1.675052333388822, "language_loss": 0.69279736, "learning_rate": 3.5499896209838118e-06, "loss": 0.71392041, "num_input_tokens_seen": 86357615, "step": 4012, "time_per_iteration": 2.804694890975952 }, { "auxiliary_loss_clip": 0.01138123, "auxiliary_loss_mlp": 0.0104149, "balance_loss_clip": 1.05126536, "balance_loss_mlp": 1.02213097, "epoch": 0.24127461295656094, "flos": 39676191269760.0, "grad_norm": 1.5084253296098848, "language_loss": 0.732813, "learning_rate": 3.5497434644345073e-06, "loss": 0.75460911, "num_input_tokens_seen": 86380355, "step": 4013, "time_per_iteration": 2.8192849159240723 }, { "auxiliary_loss_clip": 0.01148497, "auxiliary_loss_mlp": 0.01037798, "balance_loss_clip": 1.05201018, "balance_loss_mlp": 1.02044141, "epoch": 0.2413347362092289, "flos": 19135863256320.0, "grad_norm": 1.8372553923739565, "language_loss": 0.88272971, "learning_rate": 3.5494972491187753e-06, "loss": 0.90459263, "num_input_tokens_seen": 86399125, "step": 4014, "time_per_iteration": 2.6029160022735596 }, { "auxiliary_loss_clip": 0.0111397, "auxiliary_loss_mlp": 0.01046282, "balance_loss_clip": 1.04315281, "balance_loss_mlp": 1.0278163, "epoch": 0.2413948594618969, "flos": 26939430829440.0, "grad_norm": 1.9589493379590102, "language_loss": 0.94862974, "learning_rate": 3.549250975045952e-06, "loss": 0.97023225, "num_input_tokens_seen": 86418625, "step": 4015, "time_per_iteration": 2.6958773136138916 }, { "auxiliary_loss_clip": 0.01120117, "auxiliary_loss_mlp": 0.01041079, "balance_loss_clip": 1.04570341, "balance_loss_mlp": 1.02331638, "epoch": 0.24145498271456486, "flos": 25228108183680.0, "grad_norm": 1.5486712647521637, "language_loss": 0.8271699, "learning_rate": 3.5490046422253768e-06, "loss": 0.84878188, "num_input_tokens_seen": 86438375, "step": 4016, "time_per_iteration": 2.7045071125030518 }, { "auxiliary_loss_clip": 0.01098573, "auxiliary_loss_mlp": 0.01045564, "balance_loss_clip": 1.04334974, "balance_loss_mlp": 1.02838039, "epoch": 0.24151510596723283, "flos": 40661759617920.0, "grad_norm": 1.8022012115417119, "language_loss": 0.69207114, "learning_rate": 3.54875825066639e-06, "loss": 0.71351254, "num_input_tokens_seen": 86463230, "step": 4017, "time_per_iteration": 2.8596649169921875 }, { "auxiliary_loss_clip": 0.01141299, "auxiliary_loss_mlp": 0.01051243, "balance_loss_clip": 1.05106175, "balance_loss_mlp": 1.03278995, "epoch": 0.2415752292199008, "flos": 18146667634560.0, "grad_norm": 1.6419835865444041, "language_loss": 0.84953403, "learning_rate": 3.5485118003783353e-06, "loss": 0.87145936, "num_input_tokens_seen": 86481230, "step": 4018, "time_per_iteration": 2.627629518508911 }, { "auxiliary_loss_clip": 0.01046489, "auxiliary_loss_mlp": 0.01014362, "balance_loss_clip": 1.02139664, "balance_loss_mlp": 1.01140559, "epoch": 0.24163535247256876, "flos": 67288409792640.0, "grad_norm": 0.8221446343976555, "language_loss": 0.60642469, "learning_rate": 3.548265291370558e-06, "loss": 0.62703323, "num_input_tokens_seen": 86541260, "step": 4019, "time_per_iteration": 3.269498586654663 }, { "auxiliary_loss_clip": 0.01114983, "auxiliary_loss_mlp": 0.01049089, "balance_loss_clip": 1.04582107, "balance_loss_mlp": 1.0312674, "epoch": 0.24169547572523672, "flos": 24929941386240.0, "grad_norm": 1.8826005215725077, "language_loss": 0.73324752, "learning_rate": 3.5480187236524055e-06, "loss": 0.75488818, "num_input_tokens_seen": 86559580, "step": 4020, "time_per_iteration": 2.7341055870056152 }, { "auxiliary_loss_clip": 0.01111064, "auxiliary_loss_mlp": 0.01040515, "balance_loss_clip": 1.04833841, "balance_loss_mlp": 1.02315772, "epoch": 0.24175559897790472, "flos": 18728312567040.0, "grad_norm": 1.7964731743776612, "language_loss": 0.81617332, "learning_rate": 3.5477720972332285e-06, "loss": 0.83768916, "num_input_tokens_seen": 86577560, "step": 4021, "time_per_iteration": 2.7154345512390137 }, { "auxiliary_loss_clip": 0.01149117, "auxiliary_loss_mlp": 0.01050015, "balance_loss_clip": 1.04972911, "balance_loss_mlp": 1.03070307, "epoch": 0.24181572223057268, "flos": 23039281111680.0, "grad_norm": 2.078765142897874, "language_loss": 0.76601863, "learning_rate": 3.547525412122378e-06, "loss": 0.78800994, "num_input_tokens_seen": 86595350, "step": 4022, "time_per_iteration": 2.622262716293335 }, { "auxiliary_loss_clip": 0.01102927, "auxiliary_loss_mlp": 0.01053151, "balance_loss_clip": 1.042714, "balance_loss_mlp": 1.03271914, "epoch": 0.24187584548324065, "flos": 20376145923840.0, "grad_norm": 1.7360501926549048, "language_loss": 0.75283015, "learning_rate": 3.5472786683292083e-06, "loss": 0.774391, "num_input_tokens_seen": 86614805, "step": 4023, "time_per_iteration": 2.7339353561401367 }, { "auxiliary_loss_clip": 0.01121416, "auxiliary_loss_mlp": 0.01047921, "balance_loss_clip": 1.04916334, "balance_loss_mlp": 1.0309217, "epoch": 0.2419359687359086, "flos": 21397517153280.0, "grad_norm": 2.4319797200103466, "language_loss": 0.82542646, "learning_rate": 3.5470318658630766e-06, "loss": 0.84711981, "num_input_tokens_seen": 86633700, "step": 4024, "time_per_iteration": 2.6887242794036865 }, { "auxiliary_loss_clip": 0.01133297, "auxiliary_loss_mlp": 0.01047865, "balance_loss_clip": 1.05029452, "balance_loss_mlp": 1.03038907, "epoch": 0.24199609198857658, "flos": 18369385914240.0, "grad_norm": 1.7776330743080708, "language_loss": 0.85974258, "learning_rate": 3.5467850047333424e-06, "loss": 0.88155425, "num_input_tokens_seen": 86650905, "step": 4025, "time_per_iteration": 2.7049782276153564 }, { "auxiliary_loss_clip": 0.01092706, "auxiliary_loss_mlp": 0.01064486, "balance_loss_clip": 1.04161918, "balance_loss_mlp": 1.04456651, "epoch": 0.24205621524124454, "flos": 19463871277440.0, "grad_norm": 1.8800874250001207, "language_loss": 0.71681315, "learning_rate": 3.546538084949365e-06, "loss": 0.73838508, "num_input_tokens_seen": 86669185, "step": 4026, "time_per_iteration": 2.7773284912109375 }, { "auxiliary_loss_clip": 0.01135992, "auxiliary_loss_mlp": 0.01046992, "balance_loss_clip": 1.05109096, "balance_loss_mlp": 1.03088713, "epoch": 0.2421163384939125, "flos": 14976330451200.0, "grad_norm": 1.967847260356932, "language_loss": 0.64436764, "learning_rate": 3.546291106520509e-06, "loss": 0.66619748, "num_input_tokens_seen": 86686805, "step": 4027, "time_per_iteration": 2.6143524646759033 }, { "auxiliary_loss_clip": 0.01136637, "auxiliary_loss_mlp": 0.00775283, "balance_loss_clip": 1.05106425, "balance_loss_mlp": 1.00103092, "epoch": 0.2421764617465805, "flos": 18662057930880.0, "grad_norm": 3.6118562291520813, "language_loss": 0.70909715, "learning_rate": 3.5460440694561388e-06, "loss": 0.72821641, "num_input_tokens_seen": 86705520, "step": 4028, "time_per_iteration": 2.656334400177002 }, { "auxiliary_loss_clip": 0.01053475, "auxiliary_loss_mlp": 0.01050053, "balance_loss_clip": 1.02715707, "balance_loss_mlp": 1.04756165, "epoch": 0.24223658499924847, "flos": 64347327164160.0, "grad_norm": 0.865443083354021, "language_loss": 0.55302447, "learning_rate": 3.545796973765623e-06, "loss": 0.57405978, "num_input_tokens_seen": 86767320, "step": 4029, "time_per_iteration": 3.1736607551574707 }, { "auxiliary_loss_clip": 0.0113268, "auxiliary_loss_mlp": 0.01051074, "balance_loss_clip": 1.04679179, "balance_loss_mlp": 1.03252554, "epoch": 0.24229670825191643, "flos": 25775243124480.0, "grad_norm": 1.6290009052774777, "language_loss": 0.74065894, "learning_rate": 3.54554981945833e-06, "loss": 0.76249647, "num_input_tokens_seen": 86788110, "step": 4030, "time_per_iteration": 2.644153118133545 }, { "auxiliary_loss_clip": 0.01146282, "auxiliary_loss_mlp": 0.01053008, "balance_loss_clip": 1.04945433, "balance_loss_mlp": 1.03495932, "epoch": 0.2423568315045844, "flos": 20667094087680.0, "grad_norm": 2.044571760348203, "language_loss": 0.76492965, "learning_rate": 3.5453026065436343e-06, "loss": 0.78692257, "num_input_tokens_seen": 86807640, "step": 4031, "time_per_iteration": 2.608718156814575 }, { "auxiliary_loss_clip": 0.01130345, "auxiliary_loss_mlp": 0.00776083, "balance_loss_clip": 1.04857934, "balance_loss_mlp": 1.00130129, "epoch": 0.24241695475725236, "flos": 22416805393920.0, "grad_norm": 2.367928778009572, "language_loss": 0.65578043, "learning_rate": 3.5450553350309083e-06, "loss": 0.67484468, "num_input_tokens_seen": 86826795, "step": 4032, "time_per_iteration": 2.713796377182007 }, { "auxiliary_loss_clip": 0.01128183, "auxiliary_loss_mlp": 0.0104339, "balance_loss_clip": 1.04551542, "balance_loss_mlp": 1.02591443, "epoch": 0.24247707800992033, "flos": 17128995505920.0, "grad_norm": 2.055558599382263, "language_loss": 0.81589901, "learning_rate": 3.5448080049295286e-06, "loss": 0.83761466, "num_input_tokens_seen": 86843175, "step": 4033, "time_per_iteration": 2.6381332874298096 }, { "auxiliary_loss_clip": 0.01101134, "auxiliary_loss_mlp": 0.01042507, "balance_loss_clip": 1.04264998, "balance_loss_mlp": 1.02450657, "epoch": 0.2425372012625883, "flos": 31613743399680.0, "grad_norm": 2.655330103252085, "language_loss": 0.68830204, "learning_rate": 3.5445606162488754e-06, "loss": 0.70973849, "num_input_tokens_seen": 86863185, "step": 4034, "time_per_iteration": 2.8269567489624023 }, { "auxiliary_loss_clip": 0.01129717, "auxiliary_loss_mlp": 0.01036472, "balance_loss_clip": 1.05142426, "balance_loss_mlp": 1.01839972, "epoch": 0.24259732451525629, "flos": 16326032924160.0, "grad_norm": 2.305872962411053, "language_loss": 0.96432853, "learning_rate": 3.5443131689983283e-06, "loss": 0.98599035, "num_input_tokens_seen": 86880040, "step": 4035, "time_per_iteration": 2.687131643295288 }, { "auxiliary_loss_clip": 0.01116249, "auxiliary_loss_mlp": 0.01051012, "balance_loss_clip": 1.0467937, "balance_loss_mlp": 1.03419125, "epoch": 0.24265744776792425, "flos": 22856639431680.0, "grad_norm": 1.5931877581057647, "language_loss": 0.7820307, "learning_rate": 3.5440656631872715e-06, "loss": 0.80370331, "num_input_tokens_seen": 86900610, "step": 4036, "time_per_iteration": 2.7576112747192383 }, { "auxiliary_loss_clip": 0.01137826, "auxiliary_loss_mlp": 0.01049747, "balance_loss_clip": 1.05010104, "balance_loss_mlp": 1.03141224, "epoch": 0.24271757102059222, "flos": 21871573873920.0, "grad_norm": 1.6332934168141529, "language_loss": 0.74266672, "learning_rate": 3.5438180988250898e-06, "loss": 0.76454246, "num_input_tokens_seen": 86919385, "step": 4037, "time_per_iteration": 2.7860629558563232 }, { "auxiliary_loss_clip": 0.01100993, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.04173183, "balance_loss_mlp": 1.02453303, "epoch": 0.24277769427326018, "flos": 19208582340480.0, "grad_norm": 8.14050816007968, "language_loss": 0.76632005, "learning_rate": 3.543570475921171e-06, "loss": 0.78775871, "num_input_tokens_seen": 86938885, "step": 4038, "time_per_iteration": 2.691695213317871 }, { "auxiliary_loss_clip": 0.01129874, "auxiliary_loss_mlp": 0.01043604, "balance_loss_clip": 1.04768467, "balance_loss_mlp": 1.0249598, "epoch": 0.24283781752592815, "flos": 19499889640320.0, "grad_norm": 3.2334161052349817, "language_loss": 0.71992457, "learning_rate": 3.543322794484905e-06, "loss": 0.7416594, "num_input_tokens_seen": 86957705, "step": 4039, "time_per_iteration": 4.128135442733765 }, { "auxiliary_loss_clip": 0.0112766, "auxiliary_loss_mlp": 0.01048109, "balance_loss_clip": 1.04597354, "balance_loss_mlp": 1.02921474, "epoch": 0.2428979407785961, "flos": 19902196944000.0, "grad_norm": 1.6158763194283545, "language_loss": 0.78655136, "learning_rate": 3.5430750545256843e-06, "loss": 0.80830908, "num_input_tokens_seen": 86975845, "step": 4040, "time_per_iteration": 4.174723863601685 }, { "auxiliary_loss_clip": 0.01090567, "auxiliary_loss_mlp": 0.01038965, "balance_loss_clip": 1.04526615, "balance_loss_mlp": 1.02268124, "epoch": 0.2429580640312641, "flos": 24715878284160.0, "grad_norm": 2.432557236688664, "language_loss": 0.80599713, "learning_rate": 3.5428272560529027e-06, "loss": 0.8272925, "num_input_tokens_seen": 86994800, "step": 4041, "time_per_iteration": 2.7933273315429688 }, { "auxiliary_loss_clip": 0.01108653, "auxiliary_loss_mlp": 0.01044101, "balance_loss_clip": 1.04587245, "balance_loss_mlp": 1.02733982, "epoch": 0.24301818728393207, "flos": 25630343660160.0, "grad_norm": 1.9967913274059828, "language_loss": 0.76708287, "learning_rate": 3.542579399075957e-06, "loss": 0.78861034, "num_input_tokens_seen": 87016845, "step": 4042, "time_per_iteration": 4.336673021316528 }, { "auxiliary_loss_clip": 0.01056541, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.04354727, "balance_loss_mlp": 1.01928389, "epoch": 0.24307831053660003, "flos": 26141388410880.0, "grad_norm": 1.8431659047813937, "language_loss": 0.81232125, "learning_rate": 3.542331483604246e-06, "loss": 0.83324039, "num_input_tokens_seen": 87036270, "step": 4043, "time_per_iteration": 2.9156856536865234 }, { "auxiliary_loss_clip": 0.01126576, "auxiliary_loss_mlp": 0.01038857, "balance_loss_clip": 1.04610896, "balance_loss_mlp": 1.02012897, "epoch": 0.243138433789268, "flos": 14972415868800.0, "grad_norm": 2.052349433785912, "language_loss": 0.73095596, "learning_rate": 3.5420835096471706e-06, "loss": 0.75261033, "num_input_tokens_seen": 87049920, "step": 4044, "time_per_iteration": 2.6324286460876465 }, { "auxiliary_loss_clip": 0.0113453, "auxiliary_loss_mlp": 0.01042417, "balance_loss_clip": 1.04967666, "balance_loss_mlp": 1.02445269, "epoch": 0.24319855704193596, "flos": 25191694771200.0, "grad_norm": 1.8848950918191658, "language_loss": 0.83676481, "learning_rate": 3.5418354772141337e-06, "loss": 0.85853434, "num_input_tokens_seen": 87068230, "step": 4045, "time_per_iteration": 2.68994402885437 }, { "auxiliary_loss_clip": 0.010753, "auxiliary_loss_mlp": 0.01047988, "balance_loss_clip": 1.04608011, "balance_loss_mlp": 1.03117943, "epoch": 0.24325868029460393, "flos": 22127221946880.0, "grad_norm": 1.9701839557075844, "language_loss": 0.86895847, "learning_rate": 3.541587386314541e-06, "loss": 0.89019132, "num_input_tokens_seen": 87086435, "step": 4046, "time_per_iteration": 2.908737897872925 }, { "auxiliary_loss_clip": 0.01120714, "auxiliary_loss_mlp": 0.01038682, "balance_loss_clip": 1.04705977, "balance_loss_mlp": 1.02070522, "epoch": 0.2433188035472719, "flos": 23582106420480.0, "grad_norm": 1.8855160425980928, "language_loss": 0.72759771, "learning_rate": 3.5413392369578e-06, "loss": 0.74919164, "num_input_tokens_seen": 87105340, "step": 4047, "time_per_iteration": 4.310218095779419 }, { "auxiliary_loss_clip": 0.01124014, "auxiliary_loss_mlp": 0.01045256, "balance_loss_clip": 1.04447186, "balance_loss_mlp": 1.02637279, "epoch": 0.2433789267999399, "flos": 24462815990400.0, "grad_norm": 2.592486480291502, "language_loss": 0.73029542, "learning_rate": 3.5410910291533213e-06, "loss": 0.75198811, "num_input_tokens_seen": 87125780, "step": 4048, "time_per_iteration": 2.699544668197632 }, { "auxiliary_loss_clip": 0.01112707, "auxiliary_loss_mlp": 0.01045312, "balance_loss_clip": 1.04923105, "balance_loss_mlp": 1.02869391, "epoch": 0.24343905005260785, "flos": 16727909264640.0, "grad_norm": 1.921127999919884, "language_loss": 0.73616529, "learning_rate": 3.5408427629105155e-06, "loss": 0.7577455, "num_input_tokens_seen": 87144470, "step": 4049, "time_per_iteration": 2.6988370418548584 }, { "auxiliary_loss_clip": 0.01093349, "auxiliary_loss_mlp": 0.01041657, "balance_loss_clip": 1.04289758, "balance_loss_mlp": 1.02583802, "epoch": 0.24349917330527582, "flos": 20043756443520.0, "grad_norm": 2.073976648883723, "language_loss": 0.7377705, "learning_rate": 3.5405944382387985e-06, "loss": 0.75912058, "num_input_tokens_seen": 87162830, "step": 4050, "time_per_iteration": 2.718212604522705 }, { "auxiliary_loss_clip": 0.01116995, "auxiliary_loss_mlp": 0.01043968, "balance_loss_clip": 1.04518783, "balance_loss_mlp": 1.02800608, "epoch": 0.24355929655794378, "flos": 17420554200960.0, "grad_norm": 2.361179977901575, "language_loss": 0.75518602, "learning_rate": 3.5403460551475854e-06, "loss": 0.77679563, "num_input_tokens_seen": 87180905, "step": 4051, "time_per_iteration": 2.6522655487060547 }, { "auxiliary_loss_clip": 0.01092567, "auxiliary_loss_mlp": 0.01042511, "balance_loss_clip": 1.04197812, "balance_loss_mlp": 1.02507067, "epoch": 0.24361941981061175, "flos": 25410929431680.0, "grad_norm": 2.2644912923037985, "language_loss": 0.70717591, "learning_rate": 3.540097613646296e-06, "loss": 0.72852671, "num_input_tokens_seen": 87202290, "step": 4052, "time_per_iteration": 2.794059991836548 }, { "auxiliary_loss_clip": 0.0111622, "auxiliary_loss_mlp": 0.01045494, "balance_loss_clip": 1.04823005, "balance_loss_mlp": 1.02833986, "epoch": 0.2436795430632797, "flos": 22820800636800.0, "grad_norm": 1.7022998331113812, "language_loss": 0.80989587, "learning_rate": 3.539849113744351e-06, "loss": 0.83151299, "num_input_tokens_seen": 87221650, "step": 4053, "time_per_iteration": 2.682805299758911 }, { "auxiliary_loss_clip": 0.01148244, "auxiliary_loss_mlp": 0.01038109, "balance_loss_clip": 1.05124915, "balance_loss_mlp": 1.0210743, "epoch": 0.2437396663159477, "flos": 15157786982400.0, "grad_norm": 1.5338885161808513, "language_loss": 0.77628779, "learning_rate": 3.539600555451172e-06, "loss": 0.79815125, "num_input_tokens_seen": 87238515, "step": 4054, "time_per_iteration": 2.635181427001953 }, { "auxiliary_loss_clip": 0.01095192, "auxiliary_loss_mlp": 0.01055244, "balance_loss_clip": 1.04067969, "balance_loss_mlp": 1.03783989, "epoch": 0.24379978956861567, "flos": 22091131756800.0, "grad_norm": 1.8808929031646056, "language_loss": 0.84398115, "learning_rate": 3.5393519387761866e-06, "loss": 0.86548549, "num_input_tokens_seen": 87256290, "step": 4055, "time_per_iteration": 2.757601261138916 }, { "auxiliary_loss_clip": 0.01110063, "auxiliary_loss_mlp": 0.01045315, "balance_loss_clip": 1.04298997, "balance_loss_mlp": 1.02767169, "epoch": 0.24385991282128364, "flos": 31467766527360.0, "grad_norm": 2.5636936013515776, "language_loss": 0.55038011, "learning_rate": 3.5391032637288217e-06, "loss": 0.57193393, "num_input_tokens_seen": 87277085, "step": 4056, "time_per_iteration": 2.7788894176483154 }, { "auxiliary_loss_clip": 0.0113756, "auxiliary_loss_mlp": 0.01046233, "balance_loss_clip": 1.04897046, "balance_loss_mlp": 1.02876842, "epoch": 0.2439200360739516, "flos": 23838795987840.0, "grad_norm": 2.64902132986976, "language_loss": 0.80583262, "learning_rate": 3.538854530318506e-06, "loss": 0.82767057, "num_input_tokens_seen": 87293020, "step": 4057, "time_per_iteration": 2.78110671043396 }, { "auxiliary_loss_clip": 0.01132987, "auxiliary_loss_mlp": 0.01048497, "balance_loss_clip": 1.04877245, "balance_loss_mlp": 1.03145027, "epoch": 0.24398015932661957, "flos": 19169978198400.0, "grad_norm": 1.8133503864036424, "language_loss": 0.79202968, "learning_rate": 3.538605738554673e-06, "loss": 0.81384456, "num_input_tokens_seen": 87311445, "step": 4058, "time_per_iteration": 2.6609115600585938 }, { "auxiliary_loss_clip": 0.01147749, "auxiliary_loss_mlp": 0.01045059, "balance_loss_clip": 1.04827118, "balance_loss_mlp": 1.02920449, "epoch": 0.24404028257928753, "flos": 25262474520960.0, "grad_norm": 3.3482411666646086, "language_loss": 0.85503888, "learning_rate": 3.538356888446756e-06, "loss": 0.87696695, "num_input_tokens_seen": 87332055, "step": 4059, "time_per_iteration": 2.724241256713867 }, { "auxiliary_loss_clip": 0.01126127, "auxiliary_loss_mlp": 0.01038967, "balance_loss_clip": 1.04837418, "balance_loss_mlp": 1.02296889, "epoch": 0.2441004058319555, "flos": 26467600752000.0, "grad_norm": 2.2060888459440617, "language_loss": 0.7483452, "learning_rate": 3.5381079800041913e-06, "loss": 0.76999605, "num_input_tokens_seen": 87351295, "step": 4060, "time_per_iteration": 2.6769304275512695 }, { "auxiliary_loss_clip": 0.01111679, "auxiliary_loss_mlp": 0.01051445, "balance_loss_clip": 1.04629493, "balance_loss_mlp": 1.03247917, "epoch": 0.2441605290846235, "flos": 26760524163840.0, "grad_norm": 2.624850134940939, "language_loss": 0.73482168, "learning_rate": 3.5378590132364182e-06, "loss": 0.75645292, "num_input_tokens_seen": 87370650, "step": 4061, "time_per_iteration": 2.7570559978485107 }, { "auxiliary_loss_clip": 0.01144554, "auxiliary_loss_mlp": 0.01039707, "balance_loss_clip": 1.05180097, "balance_loss_mlp": 1.02394772, "epoch": 0.24422065233729146, "flos": 21105850717440.0, "grad_norm": 4.11905418985837, "language_loss": 0.76135921, "learning_rate": 3.5376099881528768e-06, "loss": 0.78320187, "num_input_tokens_seen": 87389020, "step": 4062, "time_per_iteration": 2.6387689113616943 }, { "auxiliary_loss_clip": 0.01104974, "auxiliary_loss_mlp": 0.01041222, "balance_loss_clip": 1.04618907, "balance_loss_mlp": 1.02458024, "epoch": 0.24428077558995942, "flos": 25263156879360.0, "grad_norm": 2.5628995075758954, "language_loss": 0.85376853, "learning_rate": 3.537360904763011e-06, "loss": 0.87523055, "num_input_tokens_seen": 87409695, "step": 4063, "time_per_iteration": 2.7785301208496094 }, { "auxiliary_loss_clip": 0.01119987, "auxiliary_loss_mlp": 0.01047158, "balance_loss_clip": 1.04776239, "balance_loss_mlp": 1.02789354, "epoch": 0.24434089884262739, "flos": 20485278420480.0, "grad_norm": 2.760332484942286, "language_loss": 0.6845879, "learning_rate": 3.5371117630762656e-06, "loss": 0.70625937, "num_input_tokens_seen": 87428250, "step": 4064, "time_per_iteration": 2.6691763401031494 }, { "auxiliary_loss_clip": 0.01138225, "auxiliary_loss_mlp": 0.01046639, "balance_loss_clip": 1.04773867, "balance_loss_mlp": 1.02892423, "epoch": 0.24440102209529535, "flos": 23621895711360.0, "grad_norm": 1.603702751214229, "language_loss": 0.70247531, "learning_rate": 3.536862563102088e-06, "loss": 0.72432399, "num_input_tokens_seen": 87449380, "step": 4065, "time_per_iteration": 2.6677680015563965 }, { "auxiliary_loss_clip": 0.01150465, "auxiliary_loss_mlp": 0.0104697, "balance_loss_clip": 1.05127215, "balance_loss_mlp": 1.02803993, "epoch": 0.24446114534796332, "flos": 20554729367040.0, "grad_norm": 1.788543447431289, "language_loss": 0.84282506, "learning_rate": 3.5366133048499282e-06, "loss": 0.86479944, "num_input_tokens_seen": 87465365, "step": 4066, "time_per_iteration": 2.5993456840515137 }, { "auxiliary_loss_clip": 0.01067736, "auxiliary_loss_mlp": 0.01002523, "balance_loss_clip": 1.03198457, "balance_loss_mlp": 1.00028193, "epoch": 0.24452126860063128, "flos": 60389575009920.0, "grad_norm": 0.7359455307187547, "language_loss": 0.52283657, "learning_rate": 3.5363639883292374e-06, "loss": 0.54353911, "num_input_tokens_seen": 87522525, "step": 4067, "time_per_iteration": 3.056666374206543 }, { "auxiliary_loss_clip": 0.01123042, "auxiliary_loss_mlp": 0.01045731, "balance_loss_clip": 1.04955244, "balance_loss_mlp": 1.0279212, "epoch": 0.24458139185329927, "flos": 15121660878720.0, "grad_norm": 2.6392300526537493, "language_loss": 0.7185899, "learning_rate": 3.5361146135494706e-06, "loss": 0.74027765, "num_input_tokens_seen": 87539170, "step": 4068, "time_per_iteration": 2.700847864151001 }, { "auxiliary_loss_clip": 0.01086004, "auxiliary_loss_mlp": 0.01047493, "balance_loss_clip": 1.04378593, "balance_loss_mlp": 1.02920675, "epoch": 0.24464151510596724, "flos": 27998723842560.0, "grad_norm": 2.4202919064349744, "language_loss": 0.78083313, "learning_rate": 3.5358651805200835e-06, "loss": 0.80216813, "num_input_tokens_seen": 87558875, "step": 4069, "time_per_iteration": 2.9363162517547607 }, { "auxiliary_loss_clip": 0.01119666, "auxiliary_loss_mlp": 0.0105204, "balance_loss_clip": 1.05164659, "balance_loss_mlp": 1.03445613, "epoch": 0.2447016383586352, "flos": 19792884879360.0, "grad_norm": 4.167143793475273, "language_loss": 0.80607939, "learning_rate": 3.5356156892505347e-06, "loss": 0.82779646, "num_input_tokens_seen": 87576485, "step": 4070, "time_per_iteration": 2.658191204071045 }, { "auxiliary_loss_clip": 0.01127014, "auxiliary_loss_mlp": 0.01049283, "balance_loss_clip": 1.04832387, "balance_loss_mlp": 1.03218853, "epoch": 0.24476176161130317, "flos": 26067340523520.0, "grad_norm": 1.5316441932107319, "language_loss": 0.84351504, "learning_rate": 3.5353661397502854e-06, "loss": 0.86527801, "num_input_tokens_seen": 87598620, "step": 4071, "time_per_iteration": 2.7118849754333496 }, { "auxiliary_loss_clip": 0.01120333, "auxiliary_loss_mlp": 0.01057334, "balance_loss_clip": 1.04778695, "balance_loss_mlp": 1.03601933, "epoch": 0.24482188486397113, "flos": 18843550375680.0, "grad_norm": 1.8860726044388547, "language_loss": 0.80115497, "learning_rate": 3.535116532028798e-06, "loss": 0.82293165, "num_input_tokens_seen": 87616595, "step": 4072, "time_per_iteration": 2.6662774085998535 }, { "auxiliary_loss_clip": 0.01134806, "auxiliary_loss_mlp": 0.0104215, "balance_loss_clip": 1.05156791, "balance_loss_mlp": 1.02614021, "epoch": 0.2448820081166391, "flos": 21251791676160.0, "grad_norm": 3.990887653020168, "language_loss": 0.70466423, "learning_rate": 3.5348668660955382e-06, "loss": 0.72643375, "num_input_tokens_seen": 87635755, "step": 4073, "time_per_iteration": 2.7366209030151367 }, { "auxiliary_loss_clip": 0.01110472, "auxiliary_loss_mlp": 0.01047265, "balance_loss_clip": 1.04666865, "balance_loss_mlp": 1.03090906, "epoch": 0.2449421313693071, "flos": 23950586090880.0, "grad_norm": 2.943884117668681, "language_loss": 0.67292917, "learning_rate": 3.5346171419599728e-06, "loss": 0.69450659, "num_input_tokens_seen": 87652885, "step": 4074, "time_per_iteration": 2.7158730030059814 }, { "auxiliary_loss_clip": 0.01062567, "auxiliary_loss_mlp": 0.01002121, "balance_loss_clip": 1.02741885, "balance_loss_mlp": 0.99986744, "epoch": 0.24500225462197506, "flos": 60687669980160.0, "grad_norm": 0.8927046346070237, "language_loss": 0.68608266, "learning_rate": 3.5343673596315718e-06, "loss": 0.70672953, "num_input_tokens_seen": 87713220, "step": 4075, "time_per_iteration": 3.2283740043640137 }, { "auxiliary_loss_clip": 0.01146172, "auxiliary_loss_mlp": 0.01042507, "balance_loss_clip": 1.05287361, "balance_loss_mlp": 1.02612722, "epoch": 0.24506237787464302, "flos": 26284204886400.0, "grad_norm": 2.3370219869490563, "language_loss": 0.79263043, "learning_rate": 3.5341175191198063e-06, "loss": 0.81451714, "num_input_tokens_seen": 87732680, "step": 4076, "time_per_iteration": 2.6744346618652344 }, { "auxiliary_loss_clip": 0.01128421, "auxiliary_loss_mlp": 0.00775989, "balance_loss_clip": 1.04903293, "balance_loss_mlp": 1.001266, "epoch": 0.245122501127311, "flos": 20552287242240.0, "grad_norm": 1.816414447330212, "language_loss": 0.81986046, "learning_rate": 3.533867620434151e-06, "loss": 0.83890456, "num_input_tokens_seen": 87751880, "step": 4077, "time_per_iteration": 2.729391098022461 }, { "auxiliary_loss_clip": 0.01148302, "auxiliary_loss_mlp": 0.01047154, "balance_loss_clip": 1.05185413, "balance_loss_mlp": 1.0288794, "epoch": 0.24518262437997895, "flos": 29132603447040.0, "grad_norm": 2.0328430965985045, "language_loss": 0.62790757, "learning_rate": 3.533617663584082e-06, "loss": 0.64986217, "num_input_tokens_seen": 87771795, "step": 4078, "time_per_iteration": 2.694767713546753 }, { "auxiliary_loss_clip": 0.01114498, "auxiliary_loss_mlp": 0.01039203, "balance_loss_clip": 1.04953861, "balance_loss_mlp": 1.02270436, "epoch": 0.24524274763264692, "flos": 23476924419840.0, "grad_norm": 1.5687748074794818, "language_loss": 0.75811553, "learning_rate": 3.5333676485790765e-06, "loss": 0.7796526, "num_input_tokens_seen": 87793640, "step": 4079, "time_per_iteration": 4.288895130157471 }, { "auxiliary_loss_clip": 0.01142871, "auxiliary_loss_mlp": 0.01047138, "balance_loss_clip": 1.04899406, "balance_loss_mlp": 1.02955461, "epoch": 0.24530287088531488, "flos": 17201175886080.0, "grad_norm": 1.8811380892336844, "language_loss": 0.74537313, "learning_rate": 3.5331175754286173e-06, "loss": 0.76727325, "num_input_tokens_seen": 87812390, "step": 4080, "time_per_iteration": 2.683969736099243 }, { "auxiliary_loss_clip": 0.01115604, "auxiliary_loss_mlp": 0.01041593, "balance_loss_clip": 1.04717278, "balance_loss_mlp": 1.02558291, "epoch": 0.24536299413798288, "flos": 14867449349760.0, "grad_norm": 2.2859558621761997, "language_loss": 0.83389306, "learning_rate": 3.532867444142186e-06, "loss": 0.85546505, "num_input_tokens_seen": 87830640, "step": 4081, "time_per_iteration": 2.772573947906494 }, { "auxiliary_loss_clip": 0.01114607, "auxiliary_loss_mlp": 0.01040674, "balance_loss_clip": 1.04734826, "balance_loss_mlp": 1.02473605, "epoch": 0.24542311739065084, "flos": 35262051886080.0, "grad_norm": 1.8658741711896472, "language_loss": 0.73223484, "learning_rate": 3.532617254729267e-06, "loss": 0.7537877, "num_input_tokens_seen": 87850450, "step": 4082, "time_per_iteration": 4.3304970264434814 }, { "auxiliary_loss_clip": 0.01104397, "auxiliary_loss_mlp": 0.01047151, "balance_loss_clip": 1.04542649, "balance_loss_mlp": 1.03163004, "epoch": 0.2454832406433188, "flos": 21503130117120.0, "grad_norm": 1.7143564189307843, "language_loss": 0.72032338, "learning_rate": 3.5323670071993485e-06, "loss": 0.74183893, "num_input_tokens_seen": 87868810, "step": 4083, "time_per_iteration": 2.7463390827178955 }, { "auxiliary_loss_clip": 0.01115479, "auxiliary_loss_mlp": 0.01048832, "balance_loss_clip": 1.04441845, "balance_loss_mlp": 1.02979386, "epoch": 0.24554336389598677, "flos": 14756664827520.0, "grad_norm": 2.556114612666859, "language_loss": 0.74363655, "learning_rate": 3.532116701561919e-06, "loss": 0.76527965, "num_input_tokens_seen": 87885685, "step": 4084, "time_per_iteration": 2.6828086376190186 }, { "auxiliary_loss_clip": 0.01126215, "auxiliary_loss_mlp": 0.01040078, "balance_loss_clip": 1.04541206, "balance_loss_mlp": 1.02269721, "epoch": 0.24560348714865474, "flos": 14976402278400.0, "grad_norm": 2.030442784512354, "language_loss": 0.85540497, "learning_rate": 3.531866337826471e-06, "loss": 0.87706792, "num_input_tokens_seen": 87903715, "step": 4085, "time_per_iteration": 4.236302852630615 }, { "auxiliary_loss_clip": 0.01110493, "auxiliary_loss_mlp": 0.01046501, "balance_loss_clip": 1.04634261, "balance_loss_mlp": 1.02932286, "epoch": 0.2456636104013227, "flos": 22675326554880.0, "grad_norm": 2.028282258660301, "language_loss": 0.78985649, "learning_rate": 3.5316159160024982e-06, "loss": 0.8114264, "num_input_tokens_seen": 87923375, "step": 4086, "time_per_iteration": 2.6638717651367188 }, { "auxiliary_loss_clip": 0.01087456, "auxiliary_loss_mlp": 0.0104508, "balance_loss_clip": 1.04792905, "balance_loss_mlp": 1.02847362, "epoch": 0.2457237336539907, "flos": 27417869009280.0, "grad_norm": 5.7080500305845865, "language_loss": 0.75053227, "learning_rate": 3.531365436099496e-06, "loss": 0.77185762, "num_input_tokens_seen": 87943115, "step": 4087, "time_per_iteration": 2.8027901649475098 }, { "auxiliary_loss_clip": 0.01090549, "auxiliary_loss_mlp": 0.01045493, "balance_loss_clip": 1.04807436, "balance_loss_mlp": 1.02680135, "epoch": 0.24578385690665866, "flos": 20412379768320.0, "grad_norm": 2.066557704160291, "language_loss": 0.79291761, "learning_rate": 3.5311148981269635e-06, "loss": 0.81427807, "num_input_tokens_seen": 87959505, "step": 4088, "time_per_iteration": 2.78812575340271 }, { "auxiliary_loss_clip": 0.0110062, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.04435658, "balance_loss_mlp": 1.01949525, "epoch": 0.24584398015932662, "flos": 23915393740800.0, "grad_norm": 1.4918864539426413, "language_loss": 0.77053773, "learning_rate": 3.5308643020944e-06, "loss": 0.79189926, "num_input_tokens_seen": 87979725, "step": 4089, "time_per_iteration": 2.75034761428833 }, { "auxiliary_loss_clip": 0.01125156, "auxiliary_loss_mlp": 0.0104201, "balance_loss_clip": 1.04609382, "balance_loss_mlp": 1.02470064, "epoch": 0.2459041034119946, "flos": 41496359103360.0, "grad_norm": 2.3383647352821737, "language_loss": 0.81814516, "learning_rate": 3.530613648011309e-06, "loss": 0.83981681, "num_input_tokens_seen": 87998270, "step": 4090, "time_per_iteration": 2.891878604888916 }, { "auxiliary_loss_clip": 0.01121872, "auxiliary_loss_mlp": 0.01050145, "balance_loss_clip": 1.04687834, "balance_loss_mlp": 1.03163147, "epoch": 0.24596422666466256, "flos": 19936814676480.0, "grad_norm": 1.8221600402702927, "language_loss": 0.73833978, "learning_rate": 3.5303629358871946e-06, "loss": 0.76005995, "num_input_tokens_seen": 88016760, "step": 4091, "time_per_iteration": 2.6410961151123047 }, { "auxiliary_loss_clip": 0.01114038, "auxiliary_loss_mlp": 0.01045509, "balance_loss_clip": 1.05517268, "balance_loss_mlp": 1.0279969, "epoch": 0.24602434991733052, "flos": 21544391865600.0, "grad_norm": 1.8983812190731213, "language_loss": 0.7706998, "learning_rate": 3.5301121657315653e-06, "loss": 0.79229522, "num_input_tokens_seen": 88036465, "step": 4092, "time_per_iteration": 2.7038323879241943 }, { "auxiliary_loss_clip": 0.01115501, "auxiliary_loss_mlp": 0.01040797, "balance_loss_clip": 1.04371238, "balance_loss_mlp": 1.02255797, "epoch": 0.24608447316999849, "flos": 23185078416000.0, "grad_norm": 3.1365051823944627, "language_loss": 0.81200075, "learning_rate": 3.5298613375539287e-06, "loss": 0.83356375, "num_input_tokens_seen": 88053270, "step": 4093, "time_per_iteration": 2.680634021759033 }, { "auxiliary_loss_clip": 0.01135527, "auxiliary_loss_mlp": 0.01043826, "balance_loss_clip": 1.04879606, "balance_loss_mlp": 1.02613521, "epoch": 0.24614459642266648, "flos": 19641951930240.0, "grad_norm": 1.9167765067224862, "language_loss": 0.86932534, "learning_rate": 3.529610451363797e-06, "loss": 0.89111882, "num_input_tokens_seen": 88072305, "step": 4094, "time_per_iteration": 2.6558003425598145 }, { "auxiliary_loss_clip": 0.01007267, "auxiliary_loss_mlp": 0.01019789, "balance_loss_clip": 1.03124738, "balance_loss_mlp": 1.01697576, "epoch": 0.24620471967533444, "flos": 61739816186880.0, "grad_norm": 0.7554163750993251, "language_loss": 0.57503664, "learning_rate": 3.5293595071706833e-06, "loss": 0.59530711, "num_input_tokens_seen": 88137995, "step": 4095, "time_per_iteration": 3.3576478958129883 }, { "auxiliary_loss_clip": 0.01051219, "auxiliary_loss_mlp": 0.0102022, "balance_loss_clip": 1.03409493, "balance_loss_mlp": 1.01790738, "epoch": 0.2462648429280024, "flos": 69154436315520.0, "grad_norm": 0.655284075812517, "language_loss": 0.56260574, "learning_rate": 3.5291085049841042e-06, "loss": 0.58332014, "num_input_tokens_seen": 88208490, "step": 4096, "time_per_iteration": 3.376516580581665 }, { "auxiliary_loss_clip": 0.0112712, "auxiliary_loss_mlp": 0.01040362, "balance_loss_clip": 1.05330801, "balance_loss_mlp": 1.0236733, "epoch": 0.24632496618067037, "flos": 29459605887360.0, "grad_norm": 1.7306008966026363, "language_loss": 0.77629399, "learning_rate": 3.5288574448135773e-06, "loss": 0.79796875, "num_input_tokens_seen": 88228050, "step": 4097, "time_per_iteration": 2.6973912715911865 }, { "auxiliary_loss_clip": 0.01114293, "auxiliary_loss_mlp": 0.01047339, "balance_loss_clip": 1.04898906, "balance_loss_mlp": 1.02842093, "epoch": 0.24638508943333834, "flos": 24316444068480.0, "grad_norm": 2.4079595240953613, "language_loss": 0.75890571, "learning_rate": 3.5286063266686235e-06, "loss": 0.78052205, "num_input_tokens_seen": 88248090, "step": 4098, "time_per_iteration": 2.739947557449341 }, { "auxiliary_loss_clip": 0.0112794, "auxiliary_loss_mlp": 0.01046194, "balance_loss_clip": 1.05179596, "balance_loss_mlp": 1.03002954, "epoch": 0.2464452126860063, "flos": 26613254401920.0, "grad_norm": 2.5671853201902737, "language_loss": 0.68179071, "learning_rate": 3.528355150558764e-06, "loss": 0.7035321, "num_input_tokens_seen": 88267545, "step": 4099, "time_per_iteration": 2.7144618034362793 }, { "auxiliary_loss_clip": 0.01133513, "auxiliary_loss_mlp": 0.01045673, "balance_loss_clip": 1.05187321, "balance_loss_mlp": 1.02897191, "epoch": 0.24650533593867427, "flos": 31212405763200.0, "grad_norm": 2.0343787496625656, "language_loss": 0.65915, "learning_rate": 3.5281039164935237e-06, "loss": 0.68094188, "num_input_tokens_seen": 88289785, "step": 4100, "time_per_iteration": 2.724008560180664 }, { "auxiliary_loss_clip": 0.01054067, "auxiliary_loss_mlp": 0.01041004, "balance_loss_clip": 1.03763318, "balance_loss_mlp": 1.03830957, "epoch": 0.24656545919134226, "flos": 68494002900480.0, "grad_norm": 0.7229502883874133, "language_loss": 0.61514676, "learning_rate": 3.5278526244824304e-06, "loss": 0.63609749, "num_input_tokens_seen": 88357320, "step": 4101, "time_per_iteration": 3.3748011589050293 }, { "auxiliary_loss_clip": 0.01144305, "auxiliary_loss_mlp": 0.01041937, "balance_loss_clip": 1.05133915, "balance_loss_mlp": 1.02455676, "epoch": 0.24662558244401023, "flos": 20084192179200.0, "grad_norm": 2.2333045722985028, "language_loss": 0.73272061, "learning_rate": 3.527601274535012e-06, "loss": 0.754583, "num_input_tokens_seen": 88377040, "step": 4102, "time_per_iteration": 2.7457518577575684 }, { "auxiliary_loss_clip": 0.01124231, "auxiliary_loss_mlp": 0.01043636, "balance_loss_clip": 1.04909408, "balance_loss_mlp": 1.02699423, "epoch": 0.2466857056966782, "flos": 30701361012480.0, "grad_norm": 2.9311552217427774, "language_loss": 0.76528364, "learning_rate": 3.5273498666608004e-06, "loss": 0.78696227, "num_input_tokens_seen": 88395085, "step": 4103, "time_per_iteration": 2.732285499572754 }, { "auxiliary_loss_clip": 0.01128751, "auxiliary_loss_mlp": 0.01051695, "balance_loss_clip": 1.04730439, "balance_loss_mlp": 1.03313375, "epoch": 0.24674582894934616, "flos": 22528523669760.0, "grad_norm": 2.3173933836652902, "language_loss": 0.78658336, "learning_rate": 3.5270984008693288e-06, "loss": 0.80838788, "num_input_tokens_seen": 88413205, "step": 4104, "time_per_iteration": 2.7234179973602295 }, { "auxiliary_loss_clip": 0.01134641, "auxiliary_loss_mlp": 0.01045411, "balance_loss_clip": 1.05110276, "balance_loss_mlp": 1.02601588, "epoch": 0.24680595220201412, "flos": 20704297599360.0, "grad_norm": 1.883953093480743, "language_loss": 0.8375451, "learning_rate": 3.526846877170133e-06, "loss": 0.85934561, "num_input_tokens_seen": 88431525, "step": 4105, "time_per_iteration": 2.7051403522491455 }, { "auxiliary_loss_clip": 0.01149885, "auxiliary_loss_mlp": 0.01051204, "balance_loss_clip": 1.05490828, "balance_loss_mlp": 1.03340602, "epoch": 0.2468660754546821, "flos": 21831174051840.0, "grad_norm": 1.9903096770852142, "language_loss": 0.76503521, "learning_rate": 3.52659529557275e-06, "loss": 0.78704607, "num_input_tokens_seen": 88451210, "step": 4106, "time_per_iteration": 2.6324243545532227 }, { "auxiliary_loss_clip": 0.01107346, "auxiliary_loss_mlp": 0.01058334, "balance_loss_clip": 1.0438261, "balance_loss_mlp": 1.03743649, "epoch": 0.24692619870735008, "flos": 15267709578240.0, "grad_norm": 2.3469304270549487, "language_loss": 0.72399199, "learning_rate": 3.5263436560867205e-06, "loss": 0.74564874, "num_input_tokens_seen": 88467790, "step": 4107, "time_per_iteration": 2.6767516136169434 }, { "auxiliary_loss_clip": 0.01149014, "auxiliary_loss_mlp": 0.01055902, "balance_loss_clip": 1.05365527, "balance_loss_mlp": 1.03840184, "epoch": 0.24698632196001805, "flos": 29680097523840.0, "grad_norm": 2.655550859638868, "language_loss": 0.65495557, "learning_rate": 3.526091958721587e-06, "loss": 0.67700469, "num_input_tokens_seen": 88490330, "step": 4108, "time_per_iteration": 2.666501760482788 }, { "auxiliary_loss_clip": 0.01095567, "auxiliary_loss_mlp": 0.01053352, "balance_loss_clip": 1.04577923, "balance_loss_mlp": 1.0351851, "epoch": 0.247046445212686, "flos": 39165469741440.0, "grad_norm": 1.631565192024798, "language_loss": 0.72685403, "learning_rate": 3.5258402034868936e-06, "loss": 0.74834323, "num_input_tokens_seen": 88512435, "step": 4109, "time_per_iteration": 2.8588712215423584 }, { "auxiliary_loss_clip": 0.01110552, "auxiliary_loss_mlp": 0.01048877, "balance_loss_clip": 1.04754984, "balance_loss_mlp": 1.03132939, "epoch": 0.24710656846535398, "flos": 22998845376000.0, "grad_norm": 1.9000447272053396, "language_loss": 0.79328829, "learning_rate": 3.5255883903921866e-06, "loss": 0.81488264, "num_input_tokens_seen": 88529780, "step": 4110, "time_per_iteration": 2.7403078079223633 }, { "auxiliary_loss_clip": 0.01114435, "auxiliary_loss_mlp": 0.0104359, "balance_loss_clip": 1.04750848, "balance_loss_mlp": 1.02536333, "epoch": 0.24716669171802194, "flos": 26432803451520.0, "grad_norm": 1.9757162932013852, "language_loss": 0.80630267, "learning_rate": 3.5253365194470144e-06, "loss": 0.82788301, "num_input_tokens_seen": 88547200, "step": 4111, "time_per_iteration": 2.6893255710601807 }, { "auxiliary_loss_clip": 0.01143907, "auxiliary_loss_mlp": 0.0104799, "balance_loss_clip": 1.0493356, "balance_loss_mlp": 1.03203976, "epoch": 0.2472268149706899, "flos": 23329870139520.0, "grad_norm": 1.928179444788623, "language_loss": 0.75401616, "learning_rate": 3.5250845906609294e-06, "loss": 0.77593511, "num_input_tokens_seen": 88566415, "step": 4112, "time_per_iteration": 2.641103506088257 }, { "auxiliary_loss_clip": 0.01112249, "auxiliary_loss_mlp": 0.00775958, "balance_loss_clip": 1.04847336, "balance_loss_mlp": 1.00114262, "epoch": 0.24728693822335787, "flos": 23768734510080.0, "grad_norm": 2.1227710866712908, "language_loss": 0.8244158, "learning_rate": 3.5248326040434835e-06, "loss": 0.84329784, "num_input_tokens_seen": 88585225, "step": 4113, "time_per_iteration": 2.831209182739258 }, { "auxiliary_loss_clip": 0.01143893, "auxiliary_loss_mlp": 0.01043423, "balance_loss_clip": 1.04927897, "balance_loss_mlp": 1.02574396, "epoch": 0.24734706147602586, "flos": 19317499355520.0, "grad_norm": 2.5263325514304813, "language_loss": 0.8704375, "learning_rate": 3.5245805596042322e-06, "loss": 0.89231074, "num_input_tokens_seen": 88603280, "step": 4114, "time_per_iteration": 2.7264626026153564 }, { "auxiliary_loss_clip": 0.01096969, "auxiliary_loss_mlp": 0.01047533, "balance_loss_clip": 1.04748011, "balance_loss_mlp": 1.03005731, "epoch": 0.24740718472869383, "flos": 28036932935040.0, "grad_norm": 1.6498261942323098, "language_loss": 0.75283766, "learning_rate": 3.524328457352734e-06, "loss": 0.77428269, "num_input_tokens_seen": 88624925, "step": 4115, "time_per_iteration": 2.755342483520508 }, { "auxiliary_loss_clip": 0.01018711, "auxiliary_loss_mlp": 0.01070163, "balance_loss_clip": 1.03186083, "balance_loss_mlp": 1.06756425, "epoch": 0.2474673079813618, "flos": 68107569408000.0, "grad_norm": 0.6879904854197085, "language_loss": 0.58123159, "learning_rate": 3.5240762972985475e-06, "loss": 0.60212028, "num_input_tokens_seen": 88691475, "step": 4116, "time_per_iteration": 3.4015462398529053 }, { "auxiliary_loss_clip": 0.01122111, "auxiliary_loss_mlp": 0.01038886, "balance_loss_clip": 1.04813063, "balance_loss_mlp": 1.02213693, "epoch": 0.24752743123402976, "flos": 29462119839360.0, "grad_norm": 19.427234883564427, "language_loss": 0.83599627, "learning_rate": 3.523824079451235e-06, "loss": 0.85760617, "num_input_tokens_seen": 88713425, "step": 4117, "time_per_iteration": 2.7881336212158203 }, { "auxiliary_loss_clip": 0.01041379, "auxiliary_loss_mlp": 0.00755386, "balance_loss_clip": 1.02616835, "balance_loss_mlp": 1.0023396, "epoch": 0.24758755448669773, "flos": 58350459824640.0, "grad_norm": 0.909523411860611, "language_loss": 0.63518536, "learning_rate": 3.5235718038203602e-06, "loss": 0.65315294, "num_input_tokens_seen": 88769995, "step": 4118, "time_per_iteration": 3.1125216484069824 }, { "auxiliary_loss_clip": 0.01126335, "auxiliary_loss_mlp": 0.01048787, "balance_loss_clip": 1.04487431, "balance_loss_mlp": 1.03127515, "epoch": 0.2476476777393657, "flos": 20484416494080.0, "grad_norm": 2.1708029437062546, "language_loss": 0.79272264, "learning_rate": 3.523319470415491e-06, "loss": 0.81447387, "num_input_tokens_seen": 88789970, "step": 4119, "time_per_iteration": 6.294121503829956 }, { "auxiliary_loss_clip": 0.01133521, "auxiliary_loss_mlp": 0.01044138, "balance_loss_clip": 1.05223441, "balance_loss_mlp": 1.02707899, "epoch": 0.24770780099203366, "flos": 20485853038080.0, "grad_norm": 1.7395275513138477, "language_loss": 0.74590164, "learning_rate": 3.5230670792461943e-06, "loss": 0.76767826, "num_input_tokens_seen": 88810000, "step": 4120, "time_per_iteration": 2.6947290897369385 }, { "auxiliary_loss_clip": 0.01135162, "auxiliary_loss_mlp": 0.01051636, "balance_loss_clip": 1.04963648, "balance_loss_mlp": 1.03435111, "epoch": 0.24776792424470165, "flos": 15153405523200.0, "grad_norm": 3.32651820696464, "language_loss": 0.88006538, "learning_rate": 3.522814630322041e-06, "loss": 0.90193337, "num_input_tokens_seen": 88827515, "step": 4121, "time_per_iteration": 4.181556224822998 }, { "auxiliary_loss_clip": 0.01147178, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.05039763, "balance_loss_mlp": 1.02431381, "epoch": 0.2478280474973696, "flos": 21725453347200.0, "grad_norm": 2.0457274986343204, "language_loss": 0.69676709, "learning_rate": 3.5225621236526045e-06, "loss": 0.71866482, "num_input_tokens_seen": 88845025, "step": 4122, "time_per_iteration": 2.7041239738464355 }, { "auxiliary_loss_clip": 0.01147132, "auxiliary_loss_mlp": 0.01045532, "balance_loss_clip": 1.05045271, "balance_loss_mlp": 1.02655339, "epoch": 0.24788817075003758, "flos": 20412200200320.0, "grad_norm": 2.4058135017179976, "language_loss": 0.8026911, "learning_rate": 3.5223095592474596e-06, "loss": 0.82461774, "num_input_tokens_seen": 88861740, "step": 4123, "time_per_iteration": 2.6154532432556152 }, { "auxiliary_loss_clip": 0.01085408, "auxiliary_loss_mlp": 0.0105298, "balance_loss_clip": 1.04720712, "balance_loss_mlp": 1.0354923, "epoch": 0.24794829400270554, "flos": 22594455083520.0, "grad_norm": 2.2195758993023578, "language_loss": 0.74967635, "learning_rate": 3.5220569371161846e-06, "loss": 0.77106017, "num_input_tokens_seen": 88879740, "step": 4124, "time_per_iteration": 2.787986993789673 }, { "auxiliary_loss_clip": 0.01131947, "auxiliary_loss_mlp": 0.01044392, "balance_loss_clip": 1.04892588, "balance_loss_mlp": 1.02809608, "epoch": 0.2480084172553735, "flos": 39676047615360.0, "grad_norm": 1.4128536066198873, "language_loss": 0.73432529, "learning_rate": 3.521804257268357e-06, "loss": 0.75608873, "num_input_tokens_seen": 88904095, "step": 4125, "time_per_iteration": 4.472416162490845 }, { "auxiliary_loss_clip": 0.01109646, "auxiliary_loss_mlp": 0.00776697, "balance_loss_clip": 1.04420686, "balance_loss_mlp": 1.00122678, "epoch": 0.24806854050804147, "flos": 22053712763520.0, "grad_norm": 1.9607758383710057, "language_loss": 0.69630861, "learning_rate": 3.5215515197135595e-06, "loss": 0.71517205, "num_input_tokens_seen": 88920740, "step": 4126, "time_per_iteration": 2.7412056922912598 }, { "auxiliary_loss_clip": 0.01133758, "auxiliary_loss_mlp": 0.01051914, "balance_loss_clip": 1.047984, "balance_loss_mlp": 1.03331721, "epoch": 0.24812866376070947, "flos": 15486764670720.0, "grad_norm": 2.275786464609162, "language_loss": 0.81219494, "learning_rate": 3.5212987244613764e-06, "loss": 0.83405173, "num_input_tokens_seen": 88938510, "step": 4127, "time_per_iteration": 2.620143413543701 }, { "auxiliary_loss_clip": 0.01136685, "auxiliary_loss_mlp": 0.00775421, "balance_loss_clip": 1.04974318, "balance_loss_mlp": 1.00120401, "epoch": 0.24818878701337743, "flos": 14757419013120.0, "grad_norm": 6.503475382998669, "language_loss": 0.8435086, "learning_rate": 3.5210458715213927e-06, "loss": 0.86262965, "num_input_tokens_seen": 88955235, "step": 4128, "time_per_iteration": 2.6764745712280273 }, { "auxiliary_loss_clip": 0.01117625, "auxiliary_loss_mlp": 0.01057179, "balance_loss_clip": 1.04831362, "balance_loss_mlp": 1.03814149, "epoch": 0.2482489102660454, "flos": 27089501852160.0, "grad_norm": 7.318299516736359, "language_loss": 0.6572547, "learning_rate": 3.5207929609031973e-06, "loss": 0.67900276, "num_input_tokens_seen": 88975210, "step": 4129, "time_per_iteration": 2.7178256511688232 }, { "auxiliary_loss_clip": 0.01098796, "auxiliary_loss_mlp": 0.01044421, "balance_loss_clip": 1.04595077, "balance_loss_mlp": 1.02570498, "epoch": 0.24830903351871336, "flos": 26467528924800.0, "grad_norm": 1.8507928533331595, "language_loss": 0.7496134, "learning_rate": 3.5205399926163806e-06, "loss": 0.77104557, "num_input_tokens_seen": 88996120, "step": 4130, "time_per_iteration": 2.82098126411438 }, { "auxiliary_loss_clip": 0.01078173, "auxiliary_loss_mlp": 0.01050295, "balance_loss_clip": 1.04238284, "balance_loss_mlp": 1.03163934, "epoch": 0.24836915677138133, "flos": 10228436870400.0, "grad_norm": 2.098795320061471, "language_loss": 0.7680133, "learning_rate": 3.520286966670535e-06, "loss": 0.78929794, "num_input_tokens_seen": 89008685, "step": 4131, "time_per_iteration": 2.7543740272521973 }, { "auxiliary_loss_clip": 0.0113176, "auxiliary_loss_mlp": 0.0104424, "balance_loss_clip": 1.04992545, "balance_loss_mlp": 1.02781272, "epoch": 0.2484292800240493, "flos": 30080429579520.0, "grad_norm": 2.181098565661814, "language_loss": 0.83579504, "learning_rate": 3.520033883075255e-06, "loss": 0.85755503, "num_input_tokens_seen": 89031160, "step": 4132, "time_per_iteration": 2.681339979171753 }, { "auxiliary_loss_clip": 0.01120332, "auxiliary_loss_mlp": 0.01043901, "balance_loss_clip": 1.04574823, "balance_loss_mlp": 1.02506626, "epoch": 0.24848940327671726, "flos": 13442944803840.0, "grad_norm": 1.8557605687682572, "language_loss": 0.71320271, "learning_rate": 3.5197807418401386e-06, "loss": 0.73484504, "num_input_tokens_seen": 89047235, "step": 4133, "time_per_iteration": 2.6573541164398193 }, { "auxiliary_loss_clip": 0.01150987, "auxiliary_loss_mlp": 0.0104789, "balance_loss_clip": 1.05105197, "balance_loss_mlp": 1.02624202, "epoch": 0.24854952652938525, "flos": 19970247260160.0, "grad_norm": 3.222598228665933, "language_loss": 0.61894202, "learning_rate": 3.5195275429747834e-06, "loss": 0.64093071, "num_input_tokens_seen": 89064790, "step": 4134, "time_per_iteration": 2.5639493465423584 }, { "auxiliary_loss_clip": 0.01135356, "auxiliary_loss_mlp": 0.01045434, "balance_loss_clip": 1.04877877, "balance_loss_mlp": 1.02764797, "epoch": 0.24860964978205322, "flos": 18150187167360.0, "grad_norm": 1.882175713893398, "language_loss": 0.78382719, "learning_rate": 3.5192742864887914e-06, "loss": 0.80563509, "num_input_tokens_seen": 89083250, "step": 4135, "time_per_iteration": 2.6075639724731445 }, { "auxiliary_loss_clip": 0.01123928, "auxiliary_loss_mlp": 0.01035702, "balance_loss_clip": 1.05297661, "balance_loss_mlp": 1.01917946, "epoch": 0.24866977303472118, "flos": 11728641329280.0, "grad_norm": 2.4269193192884186, "language_loss": 0.83582413, "learning_rate": 3.5190209723917662e-06, "loss": 0.85742044, "num_input_tokens_seen": 89100905, "step": 4136, "time_per_iteration": 2.623377799987793 }, { "auxiliary_loss_clip": 0.01119838, "auxiliary_loss_mlp": 0.01045223, "balance_loss_clip": 1.05071807, "balance_loss_mlp": 1.02713883, "epoch": 0.24872989628738915, "flos": 34823582565120.0, "grad_norm": 2.1322549527950665, "language_loss": 0.7057327, "learning_rate": 3.518767600693314e-06, "loss": 0.72738326, "num_input_tokens_seen": 89122630, "step": 4137, "time_per_iteration": 2.814115524291992 }, { "auxiliary_loss_clip": 0.01133507, "auxiliary_loss_mlp": 0.00775347, "balance_loss_clip": 1.0449059, "balance_loss_mlp": 1.00107706, "epoch": 0.2487900195400571, "flos": 13699347062400.0, "grad_norm": 2.085766315480858, "language_loss": 0.66914427, "learning_rate": 3.518514171403042e-06, "loss": 0.68823284, "num_input_tokens_seen": 89141050, "step": 4138, "time_per_iteration": 2.646043539047241 }, { "auxiliary_loss_clip": 0.01103579, "auxiliary_loss_mlp": 0.01036477, "balance_loss_clip": 1.04612446, "balance_loss_mlp": 1.02000237, "epoch": 0.24885014279272508, "flos": 25337815297920.0, "grad_norm": 1.983116672544965, "language_loss": 0.83913636, "learning_rate": 3.51826068453056e-06, "loss": 0.86053687, "num_input_tokens_seen": 89160810, "step": 4139, "time_per_iteration": 2.741090774536133 }, { "auxiliary_loss_clip": 0.01111549, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.04586422, "balance_loss_mlp": 1.02192068, "epoch": 0.24891026604539307, "flos": 20631434860800.0, "grad_norm": 1.4951428686450043, "language_loss": 0.78923917, "learning_rate": 3.518007140085481e-06, "loss": 0.81075907, "num_input_tokens_seen": 89180610, "step": 4140, "time_per_iteration": 2.712780237197876 }, { "auxiliary_loss_clip": 0.01048621, "auxiliary_loss_mlp": 0.01096526, "balance_loss_clip": 1.02931261, "balance_loss_mlp": 1.09464228, "epoch": 0.24897038929806103, "flos": 66960294030720.0, "grad_norm": 0.8293539951671052, "language_loss": 0.61007011, "learning_rate": 3.51775353807742e-06, "loss": 0.63152146, "num_input_tokens_seen": 89241880, "step": 4141, "time_per_iteration": 3.240020513534546 }, { "auxiliary_loss_clip": 0.01147379, "auxiliary_loss_mlp": 0.01049841, "balance_loss_clip": 1.05116534, "balance_loss_mlp": 1.03240097, "epoch": 0.249030512550729, "flos": 36392555612160.0, "grad_norm": 2.1246942361961025, "language_loss": 0.72794569, "learning_rate": 3.5174998785159913e-06, "loss": 0.74991786, "num_input_tokens_seen": 89263340, "step": 4142, "time_per_iteration": 2.7316160202026367 }, { "auxiliary_loss_clip": 0.01133287, "auxiliary_loss_mlp": 0.01044374, "balance_loss_clip": 1.04780602, "balance_loss_mlp": 1.02705276, "epoch": 0.24909063580339696, "flos": 20154576879360.0, "grad_norm": 1.7635050074541005, "language_loss": 0.80630821, "learning_rate": 3.5172461614108157e-06, "loss": 0.82808483, "num_input_tokens_seen": 89282870, "step": 4143, "time_per_iteration": 2.6763389110565186 }, { "auxiliary_loss_clip": 0.01117552, "auxiliary_loss_mlp": 0.01036613, "balance_loss_clip": 1.04615402, "balance_loss_mlp": 1.02026916, "epoch": 0.24915075905606493, "flos": 26396569607040.0, "grad_norm": 2.7235452599944145, "language_loss": 0.59766376, "learning_rate": 3.5169923867715137e-06, "loss": 0.61920542, "num_input_tokens_seen": 89303830, "step": 4144, "time_per_iteration": 2.789417266845703 }, { "auxiliary_loss_clip": 0.01128344, "auxiliary_loss_mlp": 0.01045393, "balance_loss_clip": 1.04464769, "balance_loss_mlp": 1.02850127, "epoch": 0.2492108823087329, "flos": 27527216987520.0, "grad_norm": 2.1754585056135047, "language_loss": 0.78476733, "learning_rate": 3.516738554607708e-06, "loss": 0.80650467, "num_input_tokens_seen": 89324350, "step": 4145, "time_per_iteration": 2.8416056632995605 }, { "auxiliary_loss_clip": 0.01140077, "auxiliary_loss_mlp": 0.00778414, "balance_loss_clip": 1.04980016, "balance_loss_mlp": 1.00122261, "epoch": 0.24927100556140086, "flos": 16691388111360.0, "grad_norm": 2.035933799021365, "language_loss": 0.64925039, "learning_rate": 3.5164846649290253e-06, "loss": 0.66843534, "num_input_tokens_seen": 89342875, "step": 4146, "time_per_iteration": 2.818240165710449 }, { "auxiliary_loss_clip": 0.01036642, "auxiliary_loss_mlp": 0.0100618, "balance_loss_clip": 1.02582741, "balance_loss_mlp": 1.00403452, "epoch": 0.24933112881406885, "flos": 62772464286720.0, "grad_norm": 0.9560925601012792, "language_loss": 0.67304933, "learning_rate": 3.5162307177450915e-06, "loss": 0.69347757, "num_input_tokens_seen": 89404925, "step": 4147, "time_per_iteration": 3.339989185333252 }, { "auxiliary_loss_clip": 0.01123141, "auxiliary_loss_mlp": 0.0104863, "balance_loss_clip": 1.04991198, "balance_loss_mlp": 1.03078485, "epoch": 0.24939125206673682, "flos": 26651894457600.0, "grad_norm": 2.4221411280533554, "language_loss": 0.89285177, "learning_rate": 3.5159767130655366e-06, "loss": 0.9145695, "num_input_tokens_seen": 89425090, "step": 4148, "time_per_iteration": 2.7497105598449707 }, { "auxiliary_loss_clip": 0.01098234, "auxiliary_loss_mlp": 0.01049718, "balance_loss_clip": 1.04725289, "balance_loss_mlp": 1.02874899, "epoch": 0.24945137531940478, "flos": 20704333512960.0, "grad_norm": 1.90098046882646, "language_loss": 0.68272161, "learning_rate": 3.5157226508999935e-06, "loss": 0.70420116, "num_input_tokens_seen": 89442615, "step": 4149, "time_per_iteration": 2.7739884853363037 }, { "auxiliary_loss_clip": 0.01134907, "auxiliary_loss_mlp": 0.01044357, "balance_loss_clip": 1.0508213, "balance_loss_mlp": 1.02747655, "epoch": 0.24951149857207275, "flos": 23768662682880.0, "grad_norm": 1.67166255010053, "language_loss": 0.71424097, "learning_rate": 3.515468531258095e-06, "loss": 0.73603356, "num_input_tokens_seen": 89463025, "step": 4150, "time_per_iteration": 2.6801233291625977 }, { "auxiliary_loss_clip": 0.01098898, "auxiliary_loss_mlp": 0.0104939, "balance_loss_clip": 1.04628861, "balance_loss_mlp": 1.03149676, "epoch": 0.2495716218247407, "flos": 15664881237120.0, "grad_norm": 4.371450104119659, "language_loss": 0.72732216, "learning_rate": 3.515214354149478e-06, "loss": 0.74880505, "num_input_tokens_seen": 89480225, "step": 4151, "time_per_iteration": 2.7118351459503174 }, { "auxiliary_loss_clip": 0.01142805, "auxiliary_loss_mlp": 0.01054095, "balance_loss_clip": 1.05117846, "balance_loss_mlp": 1.0357486, "epoch": 0.24963174507740868, "flos": 24052499953920.0, "grad_norm": 3.4200711789217397, "language_loss": 0.63707078, "learning_rate": 3.514960119583781e-06, "loss": 0.65903974, "num_input_tokens_seen": 89496985, "step": 4152, "time_per_iteration": 2.6352219581604004 }, { "auxiliary_loss_clip": 0.01128057, "auxiliary_loss_mlp": 0.01043812, "balance_loss_clip": 1.05110407, "balance_loss_mlp": 1.02628791, "epoch": 0.24969186833007664, "flos": 21799501234560.0, "grad_norm": 3.664579624689737, "language_loss": 0.77259195, "learning_rate": 3.514705827570645e-06, "loss": 0.79431069, "num_input_tokens_seen": 89514420, "step": 4153, "time_per_iteration": 2.6120872497558594 }, { "auxiliary_loss_clip": 0.01135035, "auxiliary_loss_mlp": 0.01042909, "balance_loss_clip": 1.05221617, "balance_loss_mlp": 1.02620757, "epoch": 0.24975199158274464, "flos": 19938143479680.0, "grad_norm": 2.5781435797973833, "language_loss": 0.7677725, "learning_rate": 3.514451478119711e-06, "loss": 0.78955191, "num_input_tokens_seen": 89532925, "step": 4154, "time_per_iteration": 2.7488853931427 }, { "auxiliary_loss_clip": 0.0113655, "auxiliary_loss_mlp": 0.01051969, "balance_loss_clip": 1.05146766, "balance_loss_mlp": 1.03251421, "epoch": 0.2498121148354126, "flos": 25338389915520.0, "grad_norm": 1.9052782276095375, "language_loss": 0.70335877, "learning_rate": 3.5141970712406258e-06, "loss": 0.72524405, "num_input_tokens_seen": 89552855, "step": 4155, "time_per_iteration": 2.6622395515441895 }, { "auxiliary_loss_clip": 0.01127695, "auxiliary_loss_mlp": 0.01047805, "balance_loss_clip": 1.05243564, "balance_loss_mlp": 1.03074658, "epoch": 0.24987223808808057, "flos": 20558787603840.0, "grad_norm": 1.6974192026095432, "language_loss": 0.74953228, "learning_rate": 3.513942606943036e-06, "loss": 0.77128726, "num_input_tokens_seen": 89572830, "step": 4156, "time_per_iteration": 2.7599329948425293 }, { "auxiliary_loss_clip": 0.01127061, "auxiliary_loss_mlp": 0.01040498, "balance_loss_clip": 1.04922485, "balance_loss_mlp": 1.02404737, "epoch": 0.24993236134074853, "flos": 19749037351680.0, "grad_norm": 2.6541448192787858, "language_loss": 0.76703429, "learning_rate": 3.513688085236591e-06, "loss": 0.78870988, "num_input_tokens_seen": 89590345, "step": 4157, "time_per_iteration": 4.172720432281494 }, { "auxiliary_loss_clip": 0.01087279, "auxiliary_loss_mlp": 0.01050682, "balance_loss_clip": 1.04686046, "balance_loss_mlp": 1.03302717, "epoch": 0.2499924845934165, "flos": 18770292587520.0, "grad_norm": 6.508490360255271, "language_loss": 0.81656492, "learning_rate": 3.513433506130942e-06, "loss": 0.83794451, "num_input_tokens_seen": 89610295, "step": 4158, "time_per_iteration": 4.373260736465454 }, { "auxiliary_loss_clip": 0.01115824, "auxiliary_loss_mlp": 0.01039502, "balance_loss_clip": 1.04740119, "balance_loss_mlp": 1.02166879, "epoch": 0.25005260784608446, "flos": 16872198197760.0, "grad_norm": 2.799032697181286, "language_loss": 0.76568067, "learning_rate": 3.5131788696357427e-06, "loss": 0.78723395, "num_input_tokens_seen": 89627795, "step": 4159, "time_per_iteration": 2.6529338359832764 }, { "auxiliary_loss_clip": 0.01139337, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.05149508, "balance_loss_mlp": 1.02013946, "epoch": 0.2501127310987524, "flos": 22124923476480.0, "grad_norm": 2.4918403268433122, "language_loss": 0.71557873, "learning_rate": 3.512924175760649e-06, "loss": 0.73735791, "num_input_tokens_seen": 89648090, "step": 4160, "time_per_iteration": 4.178418874740601 }, { "auxiliary_loss_clip": 0.01062459, "auxiliary_loss_mlp": 0.01001923, "balance_loss_clip": 1.02823949, "balance_loss_mlp": 0.99992067, "epoch": 0.2501728543514204, "flos": 69458061980160.0, "grad_norm": 0.7611682305123987, "language_loss": 0.56783372, "learning_rate": 3.5126694245153186e-06, "loss": 0.58847755, "num_input_tokens_seen": 89710345, "step": 4161, "time_per_iteration": 3.1690969467163086 }, { "auxiliary_loss_clip": 0.0114076, "auxiliary_loss_mlp": 0.01048659, "balance_loss_clip": 1.05206347, "balance_loss_mlp": 1.0308131, "epoch": 0.25023297760408836, "flos": 16289978647680.0, "grad_norm": 4.523737291621751, "language_loss": 0.80654883, "learning_rate": 3.5124146159094125e-06, "loss": 0.82844305, "num_input_tokens_seen": 89729390, "step": 4162, "time_per_iteration": 2.630491018295288 }, { "auxiliary_loss_clip": 0.01127145, "auxiliary_loss_mlp": 0.00776859, "balance_loss_clip": 1.04807281, "balance_loss_mlp": 1.00124371, "epoch": 0.2502931008567563, "flos": 12237998140800.0, "grad_norm": 3.0029202967601107, "language_loss": 0.87312925, "learning_rate": 3.5121597499525927e-06, "loss": 0.89216936, "num_input_tokens_seen": 89742805, "step": 4163, "time_per_iteration": 2.660985231399536 }, { "auxiliary_loss_clip": 0.01133331, "auxiliary_loss_mlp": 0.01039671, "balance_loss_clip": 1.0538981, "balance_loss_mlp": 1.02234972, "epoch": 0.25035322410942434, "flos": 23181882105600.0, "grad_norm": 1.700690076898522, "language_loss": 0.83170879, "learning_rate": 3.5119048266545232e-06, "loss": 0.85343885, "num_input_tokens_seen": 89761145, "step": 4164, "time_per_iteration": 4.217406988143921 }, { "auxiliary_loss_clip": 0.01131608, "auxiliary_loss_mlp": 0.01047174, "balance_loss_clip": 1.05681539, "balance_loss_mlp": 1.0309732, "epoch": 0.2504133473620923, "flos": 20917534688640.0, "grad_norm": 1.61687510361108, "language_loss": 0.73889691, "learning_rate": 3.5116498460248716e-06, "loss": 0.76068473, "num_input_tokens_seen": 89780905, "step": 4165, "time_per_iteration": 2.7395150661468506 }, { "auxiliary_loss_clip": 0.01112927, "auxiliary_loss_mlp": 0.01043589, "balance_loss_clip": 1.04912043, "balance_loss_mlp": 1.02611279, "epoch": 0.2504734706147603, "flos": 20776549806720.0, "grad_norm": 1.856982928728685, "language_loss": 0.74739552, "learning_rate": 3.5113948080733062e-06, "loss": 0.7689606, "num_input_tokens_seen": 89799230, "step": 4166, "time_per_iteration": 2.7567081451416016 }, { "auxiliary_loss_clip": 0.01110594, "auxiliary_loss_mlp": 0.01042647, "balance_loss_clip": 1.04968488, "balance_loss_mlp": 1.02651834, "epoch": 0.25053359386742824, "flos": 24349373861760.0, "grad_norm": 2.0013578528528724, "language_loss": 0.82254446, "learning_rate": 3.5111397128094973e-06, "loss": 0.84407687, "num_input_tokens_seen": 89818240, "step": 4167, "time_per_iteration": 2.692664384841919 }, { "auxiliary_loss_clip": 0.01130059, "auxiliary_loss_mlp": 0.01043694, "balance_loss_clip": 1.05185139, "balance_loss_mlp": 1.02695727, "epoch": 0.2505937171200962, "flos": 21214336769280.0, "grad_norm": 2.4392619558537407, "language_loss": 0.79381847, "learning_rate": 3.51088456024312e-06, "loss": 0.81555605, "num_input_tokens_seen": 89834485, "step": 4168, "time_per_iteration": 2.6286962032318115 }, { "auxiliary_loss_clip": 0.01138966, "auxiliary_loss_mlp": 0.01046302, "balance_loss_clip": 1.05118442, "balance_loss_mlp": 1.02704966, "epoch": 0.25065384037276417, "flos": 41427231379200.0, "grad_norm": 2.2753262043393243, "language_loss": 0.69603884, "learning_rate": 3.510629350383849e-06, "loss": 0.71789157, "num_input_tokens_seen": 89855645, "step": 4169, "time_per_iteration": 2.7935590744018555 }, { "auxiliary_loss_clip": 0.01110761, "auxiliary_loss_mlp": 0.01049625, "balance_loss_clip": 1.04870963, "balance_loss_mlp": 1.03274524, "epoch": 0.25071396362543213, "flos": 26102389219200.0, "grad_norm": 1.8250030020409629, "language_loss": 0.78045398, "learning_rate": 3.510374083241361e-06, "loss": 0.80205786, "num_input_tokens_seen": 89874895, "step": 4170, "time_per_iteration": 2.7728679180145264 }, { "auxiliary_loss_clip": 0.01128286, "auxiliary_loss_mlp": 0.01043437, "balance_loss_clip": 1.05320668, "balance_loss_mlp": 1.02662849, "epoch": 0.2507740868781001, "flos": 19098982967040.0, "grad_norm": 2.5073993684848004, "language_loss": 0.76440209, "learning_rate": 3.5101187588253368e-06, "loss": 0.78611928, "num_input_tokens_seen": 89891700, "step": 4171, "time_per_iteration": 2.7825160026550293 }, { "auxiliary_loss_clip": 0.01061117, "auxiliary_loss_mlp": 0.01002396, "balance_loss_clip": 1.027282, "balance_loss_mlp": 1.00034571, "epoch": 0.25083421013076806, "flos": 64341868296960.0, "grad_norm": 0.8424544393272001, "language_loss": 0.6006161, "learning_rate": 3.509863377145458e-06, "loss": 0.62125123, "num_input_tokens_seen": 89955775, "step": 4172, "time_per_iteration": 3.1981940269470215 }, { "auxiliary_loss_clip": 0.01125517, "auxiliary_loss_mlp": 0.01046213, "balance_loss_clip": 1.05005789, "balance_loss_mlp": 1.02821243, "epoch": 0.25089433338343603, "flos": 24279599692800.0, "grad_norm": 1.4368714421460043, "language_loss": 0.79106563, "learning_rate": 3.509607938211409e-06, "loss": 0.81278288, "num_input_tokens_seen": 89977150, "step": 4173, "time_per_iteration": 2.8311028480529785 }, { "auxiliary_loss_clip": 0.01152553, "auxiliary_loss_mlp": 0.0104675, "balance_loss_clip": 1.05725241, "balance_loss_mlp": 1.02986968, "epoch": 0.250954456636104, "flos": 14721472477440.0, "grad_norm": 2.103663042812158, "language_loss": 0.83371937, "learning_rate": 3.509352442032875e-06, "loss": 0.85571229, "num_input_tokens_seen": 89994925, "step": 4174, "time_per_iteration": 2.696199893951416 }, { "auxiliary_loss_clip": 0.01095749, "auxiliary_loss_mlp": 0.01049206, "balance_loss_clip": 1.04728913, "balance_loss_mlp": 1.03095484, "epoch": 0.25101457988877196, "flos": 22273593868800.0, "grad_norm": 43.022796554959484, "language_loss": 0.71023381, "learning_rate": 3.509096888619545e-06, "loss": 0.73168337, "num_input_tokens_seen": 90013235, "step": 4175, "time_per_iteration": 2.8337926864624023 }, { "auxiliary_loss_clip": 0.01119154, "auxiliary_loss_mlp": 0.01038924, "balance_loss_clip": 1.05135846, "balance_loss_mlp": 1.02145982, "epoch": 0.2510747031414399, "flos": 25188929424000.0, "grad_norm": 2.017414900854033, "language_loss": 0.80957019, "learning_rate": 3.50884127798111e-06, "loss": 0.83115101, "num_input_tokens_seen": 90032150, "step": 4176, "time_per_iteration": 2.936908483505249 }, { "auxiliary_loss_clip": 0.01127542, "auxiliary_loss_mlp": 0.0104611, "balance_loss_clip": 1.0535233, "balance_loss_mlp": 1.02753711, "epoch": 0.25113482639410795, "flos": 20704189858560.0, "grad_norm": 2.475574978330162, "language_loss": 0.82294285, "learning_rate": 3.5085856101272623e-06, "loss": 0.84467936, "num_input_tokens_seen": 90049085, "step": 4177, "time_per_iteration": 2.7630460262298584 }, { "auxiliary_loss_clip": 0.01110202, "auxiliary_loss_mlp": 0.01051495, "balance_loss_clip": 1.05168724, "balance_loss_mlp": 1.03386414, "epoch": 0.2511949496467759, "flos": 21506936958720.0, "grad_norm": 2.1761277698635593, "language_loss": 0.82517993, "learning_rate": 3.508329885067698e-06, "loss": 0.84679693, "num_input_tokens_seen": 90067695, "step": 4178, "time_per_iteration": 2.7356274127960205 }, { "auxiliary_loss_clip": 0.01145101, "auxiliary_loss_mlp": 0.00775573, "balance_loss_clip": 1.05324888, "balance_loss_mlp": 1.00148535, "epoch": 0.2512550728994439, "flos": 20701999128960.0, "grad_norm": 2.1475299559000947, "language_loss": 0.75229692, "learning_rate": 3.508074102812112e-06, "loss": 0.77150369, "num_input_tokens_seen": 90083890, "step": 4179, "time_per_iteration": 2.631096363067627 }, { "auxiliary_loss_clip": 0.01109293, "auxiliary_loss_mlp": 0.01056583, "balance_loss_clip": 1.04920673, "balance_loss_mlp": 1.03833175, "epoch": 0.25131519615211184, "flos": 18478626151680.0, "grad_norm": 1.9599833122138943, "language_loss": 0.69976825, "learning_rate": 3.507818263370206e-06, "loss": 0.72142696, "num_input_tokens_seen": 90100995, "step": 4180, "time_per_iteration": 2.708122730255127 }, { "auxiliary_loss_clip": 0.01147992, "auxiliary_loss_mlp": 0.01045783, "balance_loss_clip": 1.05343485, "balance_loss_mlp": 1.02909422, "epoch": 0.2513753194047798, "flos": 20484955198080.0, "grad_norm": 1.8622914556591927, "language_loss": 0.85940182, "learning_rate": 3.5075623667516796e-06, "loss": 0.88133955, "num_input_tokens_seen": 90120365, "step": 4181, "time_per_iteration": 2.633091449737549 }, { "auxiliary_loss_clip": 0.01148017, "auxiliary_loss_mlp": 0.01049707, "balance_loss_clip": 1.05351245, "balance_loss_mlp": 1.03270781, "epoch": 0.25143544265744777, "flos": 37670077704960.0, "grad_norm": 2.0695978407502467, "language_loss": 0.6856631, "learning_rate": 3.507306412966238e-06, "loss": 0.70764029, "num_input_tokens_seen": 90142610, "step": 4182, "time_per_iteration": 2.8169894218444824 }, { "auxiliary_loss_clip": 0.01041202, "auxiliary_loss_mlp": 0.010083, "balance_loss_clip": 1.02456141, "balance_loss_mlp": 1.00577307, "epoch": 0.25149556591011574, "flos": 69367457923200.0, "grad_norm": 0.8403189096432666, "language_loss": 0.70032597, "learning_rate": 3.5070504020235853e-06, "loss": 0.72082102, "num_input_tokens_seen": 90200555, "step": 4183, "time_per_iteration": 3.2070610523223877 }, { "auxiliary_loss_clip": 0.01130203, "auxiliary_loss_mlp": 0.01042834, "balance_loss_clip": 1.05145216, "balance_loss_mlp": 1.02441609, "epoch": 0.2515556891627837, "flos": 13990402967040.0, "grad_norm": 1.8802113118438855, "language_loss": 0.73834902, "learning_rate": 3.506794333933431e-06, "loss": 0.76007938, "num_input_tokens_seen": 90218120, "step": 4184, "time_per_iteration": 2.691950559616089 }, { "auxiliary_loss_clip": 0.01136971, "auxiliary_loss_mlp": 0.01047362, "balance_loss_clip": 1.05233765, "balance_loss_mlp": 1.0297792, "epoch": 0.25161581241545167, "flos": 22163527618560.0, "grad_norm": 1.8676646084141537, "language_loss": 0.8334859, "learning_rate": 3.506538208705484e-06, "loss": 0.85532916, "num_input_tokens_seen": 90236790, "step": 4185, "time_per_iteration": 2.6931228637695312 }, { "auxiliary_loss_clip": 0.01022217, "auxiliary_loss_mlp": 0.01010846, "balance_loss_clip": 1.03471541, "balance_loss_mlp": 1.00902176, "epoch": 0.25167593566811963, "flos": 69358407696000.0, "grad_norm": 0.7883550117667959, "language_loss": 0.61448294, "learning_rate": 3.5062820263494574e-06, "loss": 0.63481361, "num_input_tokens_seen": 90297070, "step": 4186, "time_per_iteration": 3.175295829772949 }, { "auxiliary_loss_clip": 0.01107804, "auxiliary_loss_mlp": 0.01041844, "balance_loss_clip": 1.04873872, "balance_loss_mlp": 1.02405787, "epoch": 0.2517360589207876, "flos": 13261452359040.0, "grad_norm": 1.8553357788385085, "language_loss": 0.79070914, "learning_rate": 3.5060257868750656e-06, "loss": 0.81220555, "num_input_tokens_seen": 90315255, "step": 4187, "time_per_iteration": 2.887378215789795 }, { "auxiliary_loss_clip": 0.01091434, "auxiliary_loss_mlp": 0.01049489, "balance_loss_clip": 1.0482558, "balance_loss_mlp": 1.03138089, "epoch": 0.25179618217345556, "flos": 20376828282240.0, "grad_norm": 3.7749228259968586, "language_loss": 0.79629189, "learning_rate": 3.5057694902920244e-06, "loss": 0.8177011, "num_input_tokens_seen": 90334990, "step": 4188, "time_per_iteration": 2.8985629081726074 }, { "auxiliary_loss_clip": 0.01133381, "auxiliary_loss_mlp": 0.01046993, "balance_loss_clip": 1.05168021, "balance_loss_mlp": 1.03012538, "epoch": 0.25185630542612353, "flos": 27664718250240.0, "grad_norm": 1.7363151422402578, "language_loss": 0.74419165, "learning_rate": 3.5055131366100534e-06, "loss": 0.76599538, "num_input_tokens_seen": 90351825, "step": 4189, "time_per_iteration": 2.697097063064575 }, { "auxiliary_loss_clip": 0.01118534, "auxiliary_loss_mlp": 0.01044827, "balance_loss_clip": 1.04871011, "balance_loss_mlp": 1.02862656, "epoch": 0.25191642867879155, "flos": 20996430912000.0, "grad_norm": 2.0536634388060078, "language_loss": 0.84721291, "learning_rate": 3.5052567258388745e-06, "loss": 0.86884648, "num_input_tokens_seen": 90369860, "step": 4190, "time_per_iteration": 2.731227397918701 }, { "auxiliary_loss_clip": 0.01118209, "auxiliary_loss_mlp": 0.01044895, "balance_loss_clip": 1.04597688, "balance_loss_mlp": 1.02633369, "epoch": 0.2519765519314595, "flos": 21105671149440.0, "grad_norm": 2.0130913170662783, "language_loss": 0.75695485, "learning_rate": 3.5050002579882082e-06, "loss": 0.77858591, "num_input_tokens_seen": 90389245, "step": 4191, "time_per_iteration": 2.7403173446655273 }, { "auxiliary_loss_clip": 0.01048031, "auxiliary_loss_mlp": 0.01014765, "balance_loss_clip": 1.02375531, "balance_loss_mlp": 1.0122261, "epoch": 0.2520366751841275, "flos": 62744993360640.0, "grad_norm": 0.7280864395517058, "language_loss": 0.57129633, "learning_rate": 3.5047437330677823e-06, "loss": 0.59192419, "num_input_tokens_seen": 90456735, "step": 4192, "time_per_iteration": 3.237478017807007 }, { "auxiliary_loss_clip": 0.01121978, "auxiliary_loss_mlp": 0.01041578, "balance_loss_clip": 1.05535698, "balance_loss_mlp": 1.02374434, "epoch": 0.25209679843679544, "flos": 22230716008320.0, "grad_norm": 1.8423117439969312, "language_loss": 0.76066267, "learning_rate": 3.504487151087323e-06, "loss": 0.78229821, "num_input_tokens_seen": 90474165, "step": 4193, "time_per_iteration": 2.699486255645752 }, { "auxiliary_loss_clip": 0.01137884, "auxiliary_loss_mlp": 0.01046125, "balance_loss_clip": 1.05232048, "balance_loss_mlp": 1.02869618, "epoch": 0.2521569216894634, "flos": 12166643773440.0, "grad_norm": 3.5003037089711437, "language_loss": 0.84335077, "learning_rate": 3.5042305120565598e-06, "loss": 0.86519086, "num_input_tokens_seen": 90491660, "step": 4194, "time_per_iteration": 2.6561896800994873 }, { "auxiliary_loss_clip": 0.01149932, "auxiliary_loss_mlp": 0.01050793, "balance_loss_clip": 1.05253458, "balance_loss_mlp": 1.03461599, "epoch": 0.2522170449421314, "flos": 23699786353920.0, "grad_norm": 1.3753304678825264, "language_loss": 0.88249695, "learning_rate": 3.5039738159852253e-06, "loss": 0.90450418, "num_input_tokens_seen": 90514025, "step": 4195, "time_per_iteration": 2.67887806892395 }, { "auxiliary_loss_clip": 0.01150202, "auxiliary_loss_mlp": 0.01041959, "balance_loss_clip": 1.05412734, "balance_loss_mlp": 1.02199149, "epoch": 0.25227716819479934, "flos": 20955456472320.0, "grad_norm": 2.4146072325129087, "language_loss": 0.85488242, "learning_rate": 3.503717062883053e-06, "loss": 0.87680399, "num_input_tokens_seen": 90533530, "step": 4196, "time_per_iteration": 2.6358916759490967 }, { "auxiliary_loss_clip": 0.01137804, "auxiliary_loss_mlp": 0.01049246, "balance_loss_clip": 1.05213511, "balance_loss_mlp": 1.03193665, "epoch": 0.2523372914474673, "flos": 23331342597120.0, "grad_norm": 1.9329643035636839, "language_loss": 0.8319478, "learning_rate": 3.5034602527597786e-06, "loss": 0.8538183, "num_input_tokens_seen": 90554025, "step": 4197, "time_per_iteration": 5.738839387893677 }, { "auxiliary_loss_clip": 0.01140063, "auxiliary_loss_mlp": 0.01051416, "balance_loss_clip": 1.05392218, "balance_loss_mlp": 1.03224671, "epoch": 0.25239741470013527, "flos": 36970321875840.0, "grad_norm": 2.1358917159416104, "language_loss": 0.72820318, "learning_rate": 3.5032033856251405e-06, "loss": 0.75011802, "num_input_tokens_seen": 90576930, "step": 4198, "time_per_iteration": 2.8819963932037354 }, { "auxiliary_loss_clip": 0.01152924, "auxiliary_loss_mlp": 0.01048555, "balance_loss_clip": 1.05455935, "balance_loss_mlp": 1.03045893, "epoch": 0.25245753795280323, "flos": 18515757836160.0, "grad_norm": 6.722547943004915, "language_loss": 0.76560014, "learning_rate": 3.50294646148888e-06, "loss": 0.78761488, "num_input_tokens_seen": 90595710, "step": 4199, "time_per_iteration": 2.636993169784546 }, { "auxiliary_loss_clip": 0.01125413, "auxiliary_loss_mlp": 0.00776026, "balance_loss_clip": 1.05274642, "balance_loss_mlp": 1.00117147, "epoch": 0.2525176612054712, "flos": 32344884737280.0, "grad_norm": 1.814097809936595, "language_loss": 0.73571241, "learning_rate": 3.502689480360739e-06, "loss": 0.75472683, "num_input_tokens_seen": 90617945, "step": 4200, "time_per_iteration": 4.297755002975464 }, { "auxiliary_loss_clip": 0.01137136, "auxiliary_loss_mlp": 0.01047957, "balance_loss_clip": 1.05050063, "balance_loss_mlp": 1.03187585, "epoch": 0.25257778445813917, "flos": 45258217459200.0, "grad_norm": 1.6490086858694837, "language_loss": 0.8223114, "learning_rate": 3.5024324422504616e-06, "loss": 0.84416234, "num_input_tokens_seen": 90640855, "step": 4201, "time_per_iteration": 2.859703302383423 }, { "auxiliary_loss_clip": 0.01098423, "auxiliary_loss_mlp": 0.01048, "balance_loss_clip": 1.05422068, "balance_loss_mlp": 1.03126347, "epoch": 0.25263790771080713, "flos": 23367791923200.0, "grad_norm": 1.9307441853812024, "language_loss": 0.74854887, "learning_rate": 3.5021753471677965e-06, "loss": 0.77001321, "num_input_tokens_seen": 90661350, "step": 4202, "time_per_iteration": 2.7475366592407227 }, { "auxiliary_loss_clip": 0.01134371, "auxiliary_loss_mlp": 0.01040637, "balance_loss_clip": 1.05362439, "balance_loss_mlp": 1.02392364, "epoch": 0.25269803096347515, "flos": 18515039564160.0, "grad_norm": 1.882597455778369, "language_loss": 0.7323755, "learning_rate": 3.501918195122491e-06, "loss": 0.75412554, "num_input_tokens_seen": 90680540, "step": 4203, "time_per_iteration": 2.6547653675079346 }, { "auxiliary_loss_clip": 0.01128208, "auxiliary_loss_mlp": 0.01039636, "balance_loss_clip": 1.05176711, "balance_loss_mlp": 1.02239835, "epoch": 0.2527581542161431, "flos": 24610552629120.0, "grad_norm": 1.4386036639708744, "language_loss": 0.77731073, "learning_rate": 3.501660986124297e-06, "loss": 0.79898918, "num_input_tokens_seen": 90703460, "step": 4204, "time_per_iteration": 4.4116432666778564 }, { "auxiliary_loss_clip": 0.01115267, "auxiliary_loss_mlp": 0.01052396, "balance_loss_clip": 1.05262613, "balance_loss_mlp": 1.03453815, "epoch": 0.2528182774688111, "flos": 12641275111680.0, "grad_norm": 1.9357035590368088, "language_loss": 0.72175288, "learning_rate": 3.5014037201829684e-06, "loss": 0.74342954, "num_input_tokens_seen": 90718815, "step": 4205, "time_per_iteration": 2.6750712394714355 }, { "auxiliary_loss_clip": 0.01124756, "auxiliary_loss_mlp": 0.01044172, "balance_loss_clip": 1.05032194, "balance_loss_mlp": 1.02801895, "epoch": 0.25287840072147905, "flos": 46936789879680.0, "grad_norm": 1.4680577763339375, "language_loss": 0.75594378, "learning_rate": 3.50114639730826e-06, "loss": 0.77763301, "num_input_tokens_seen": 90742125, "step": 4206, "time_per_iteration": 2.876408815383911 }, { "auxiliary_loss_clip": 0.01107683, "auxiliary_loss_mlp": 0.01044618, "balance_loss_clip": 1.04771221, "balance_loss_mlp": 1.02780974, "epoch": 0.252938523974147, "flos": 18879712392960.0, "grad_norm": 1.5378963492414741, "language_loss": 0.78807724, "learning_rate": 3.5008890175099296e-06, "loss": 0.80960023, "num_input_tokens_seen": 90760785, "step": 4207, "time_per_iteration": 2.7176475524902344 }, { "auxiliary_loss_clip": 0.01133715, "auxiliary_loss_mlp": 0.01055631, "balance_loss_clip": 1.0547328, "balance_loss_mlp": 1.03984797, "epoch": 0.252998647226815, "flos": 21434720664960.0, "grad_norm": 1.5723877129370716, "language_loss": 0.76399815, "learning_rate": 3.5006315807977375e-06, "loss": 0.78589159, "num_input_tokens_seen": 90780045, "step": 4208, "time_per_iteration": 2.797658920288086 }, { "auxiliary_loss_clip": 0.01131059, "auxiliary_loss_mlp": 0.01040866, "balance_loss_clip": 1.05162513, "balance_loss_mlp": 1.02465391, "epoch": 0.25305877047948294, "flos": 25442171285760.0, "grad_norm": 3.9595354320915166, "language_loss": 0.69848049, "learning_rate": 3.5003740871814456e-06, "loss": 0.72019976, "num_input_tokens_seen": 90797980, "step": 4209, "time_per_iteration": 2.738159418106079 }, { "auxiliary_loss_clip": 0.01046521, "auxiliary_loss_mlp": 0.0100386, "balance_loss_clip": 1.02250004, "balance_loss_mlp": 1.0015471, "epoch": 0.2531188937321509, "flos": 60185603629440.0, "grad_norm": 0.7787603502724176, "language_loss": 0.55091059, "learning_rate": 3.5001165366708175e-06, "loss": 0.57141441, "num_input_tokens_seen": 90864865, "step": 4210, "time_per_iteration": 3.196953535079956 }, { "auxiliary_loss_clip": 0.01113643, "auxiliary_loss_mlp": 0.01038759, "balance_loss_clip": 1.05103207, "balance_loss_mlp": 1.02215338, "epoch": 0.25317901698481887, "flos": 19682387665920.0, "grad_norm": 1.8504444580052586, "language_loss": 0.8006835, "learning_rate": 3.4998589292756204e-06, "loss": 0.82220757, "num_input_tokens_seen": 90882885, "step": 4211, "time_per_iteration": 2.7241647243499756 }, { "auxiliary_loss_clip": 0.01095085, "auxiliary_loss_mlp": 0.01044368, "balance_loss_clip": 1.04594803, "balance_loss_mlp": 1.02844775, "epoch": 0.25323914023748684, "flos": 24424355502720.0, "grad_norm": 1.531596575729193, "language_loss": 0.78362429, "learning_rate": 3.499601265005622e-06, "loss": 0.80501878, "num_input_tokens_seen": 90902985, "step": 4212, "time_per_iteration": 2.788607358932495 }, { "auxiliary_loss_clip": 0.01133893, "auxiliary_loss_mlp": 0.01041038, "balance_loss_clip": 1.04857254, "balance_loss_mlp": 1.02401471, "epoch": 0.2532992634901548, "flos": 25447450584960.0, "grad_norm": 2.123277134845907, "language_loss": 0.53516036, "learning_rate": 3.4993435438705938e-06, "loss": 0.55690968, "num_input_tokens_seen": 90923550, "step": 4213, "time_per_iteration": 2.6675784587860107 }, { "auxiliary_loss_clip": 0.01120924, "auxiliary_loss_mlp": 0.01044765, "balance_loss_clip": 1.05005503, "balance_loss_mlp": 1.0273726, "epoch": 0.25335938674282277, "flos": 18880538405760.0, "grad_norm": 2.4965805840577002, "language_loss": 0.65416414, "learning_rate": 3.499085765880308e-06, "loss": 0.67582107, "num_input_tokens_seen": 90943260, "step": 4214, "time_per_iteration": 2.691359281539917 }, { "auxiliary_loss_clip": 0.01046401, "auxiliary_loss_mlp": 0.01002761, "balance_loss_clip": 1.02238619, "balance_loss_mlp": 1.00056791, "epoch": 0.25341950999549073, "flos": 53062649936640.0, "grad_norm": 0.8515065776804692, "language_loss": 0.58004916, "learning_rate": 3.4988279310445396e-06, "loss": 0.60054076, "num_input_tokens_seen": 90996295, "step": 4215, "time_per_iteration": 2.981840133666992 }, { "auxiliary_loss_clip": 0.01124794, "auxiliary_loss_mlp": 0.01043531, "balance_loss_clip": 1.05316496, "balance_loss_mlp": 1.02655554, "epoch": 0.2534796332481587, "flos": 39020247054720.0, "grad_norm": 1.7497766885830588, "language_loss": 0.83251095, "learning_rate": 3.498570039373066e-06, "loss": 0.85419416, "num_input_tokens_seen": 91017545, "step": 4216, "time_per_iteration": 2.912137508392334 }, { "auxiliary_loss_clip": 0.0112972, "auxiliary_loss_mlp": 0.01040052, "balance_loss_clip": 1.05088937, "balance_loss_mlp": 1.02338624, "epoch": 0.2535397565008267, "flos": 23586990670080.0, "grad_norm": 3.3733415491927996, "language_loss": 0.80008072, "learning_rate": 3.498312090875666e-06, "loss": 0.82177842, "num_input_tokens_seen": 91037715, "step": 4217, "time_per_iteration": 2.6532363891601562 }, { "auxiliary_loss_clip": 0.01116019, "auxiliary_loss_mlp": 0.01038346, "balance_loss_clip": 1.04436612, "balance_loss_mlp": 1.02234793, "epoch": 0.2535998797534947, "flos": 19281373251840.0, "grad_norm": 2.333881972650505, "language_loss": 0.75585902, "learning_rate": 3.4980540855621218e-06, "loss": 0.77740264, "num_input_tokens_seen": 91055295, "step": 4218, "time_per_iteration": 2.650867223739624 }, { "auxiliary_loss_clip": 0.0113544, "auxiliary_loss_mlp": 0.0104021, "balance_loss_clip": 1.04940748, "balance_loss_mlp": 1.0229727, "epoch": 0.25366000300616265, "flos": 24024382583040.0, "grad_norm": 2.040148074486094, "language_loss": 0.74188256, "learning_rate": 3.4977960234422167e-06, "loss": 0.76363909, "num_input_tokens_seen": 91075485, "step": 4219, "time_per_iteration": 2.727161169052124 }, { "auxiliary_loss_clip": 0.01138406, "auxiliary_loss_mlp": 0.01048455, "balance_loss_clip": 1.05222011, "balance_loss_mlp": 1.03138447, "epoch": 0.2537201262588306, "flos": 16289368116480.0, "grad_norm": 4.990704095966988, "language_loss": 0.81355274, "learning_rate": 3.497537904525736e-06, "loss": 0.83542132, "num_input_tokens_seen": 91093620, "step": 4220, "time_per_iteration": 2.6146652698516846 }, { "auxiliary_loss_clip": 0.01100698, "auxiliary_loss_mlp": 0.01049127, "balance_loss_clip": 1.04988587, "balance_loss_mlp": 1.03041148, "epoch": 0.2537802495114986, "flos": 23294677789440.0, "grad_norm": 2.3092995740689197, "language_loss": 0.70819569, "learning_rate": 3.497279728822468e-06, "loss": 0.72969389, "num_input_tokens_seen": 91114110, "step": 4221, "time_per_iteration": 2.851747751235962 }, { "auxiliary_loss_clip": 0.0114682, "auxiliary_loss_mlp": 0.01039444, "balance_loss_clip": 1.05224657, "balance_loss_mlp": 1.02257586, "epoch": 0.25384037276416654, "flos": 17639142416640.0, "grad_norm": 2.4229893622177188, "language_loss": 0.61689377, "learning_rate": 3.497021496342202e-06, "loss": 0.63875645, "num_input_tokens_seen": 91133135, "step": 4222, "time_per_iteration": 2.6394412517547607 }, { "auxiliary_loss_clip": 0.01138378, "auxiliary_loss_mlp": 0.01051871, "balance_loss_clip": 1.05371165, "balance_loss_mlp": 1.03528929, "epoch": 0.2539004960168345, "flos": 21507044699520.0, "grad_norm": 1.6839261376783914, "language_loss": 0.74744058, "learning_rate": 3.496763207094731e-06, "loss": 0.76934308, "num_input_tokens_seen": 91151805, "step": 4223, "time_per_iteration": 2.648322105407715 }, { "auxiliary_loss_clip": 0.01092255, "auxiliary_loss_mlp": 0.01039082, "balance_loss_clip": 1.04767203, "balance_loss_mlp": 1.02325082, "epoch": 0.2539606192695025, "flos": 23950909313280.0, "grad_norm": 1.7092524284111348, "language_loss": 0.80226004, "learning_rate": 3.49650486108985e-06, "loss": 0.82357341, "num_input_tokens_seen": 91172270, "step": 4224, "time_per_iteration": 2.7572662830352783 }, { "auxiliary_loss_clip": 0.01130506, "auxiliary_loss_mlp": 0.00774076, "balance_loss_clip": 1.05102324, "balance_loss_mlp": 1.00112057, "epoch": 0.25402074252217044, "flos": 24169784837760.0, "grad_norm": 1.4497407173280796, "language_loss": 0.77330017, "learning_rate": 3.496246458337354e-06, "loss": 0.792346, "num_input_tokens_seen": 91192080, "step": 4225, "time_per_iteration": 2.7661190032958984 }, { "auxiliary_loss_clip": 0.01130647, "auxiliary_loss_mlp": 0.01049954, "balance_loss_clip": 1.04919255, "balance_loss_mlp": 1.03271639, "epoch": 0.2540808657748384, "flos": 22303758314880.0, "grad_norm": 2.0615353379683055, "language_loss": 0.84638137, "learning_rate": 3.4959879988470426e-06, "loss": 0.86818743, "num_input_tokens_seen": 91211450, "step": 4226, "time_per_iteration": 2.690683126449585 }, { "auxiliary_loss_clip": 0.01143268, "auxiliary_loss_mlp": 0.01043336, "balance_loss_clip": 1.05067408, "balance_loss_mlp": 1.02613425, "epoch": 0.25414098902750637, "flos": 27599541022080.0, "grad_norm": 1.5600656222031943, "language_loss": 0.70886129, "learning_rate": 3.4957294826287164e-06, "loss": 0.73072731, "num_input_tokens_seen": 91231835, "step": 4227, "time_per_iteration": 2.6647307872772217 }, { "auxiliary_loss_clip": 0.01055229, "auxiliary_loss_mlp": 0.01001956, "balance_loss_clip": 1.02168798, "balance_loss_mlp": 0.9995476, "epoch": 0.25420111228017434, "flos": 58170834887040.0, "grad_norm": 0.9869295588353136, "language_loss": 0.61927998, "learning_rate": 3.4954709096921785e-06, "loss": 0.63985181, "num_input_tokens_seen": 91288755, "step": 4228, "time_per_iteration": 2.986067533493042 }, { "auxiliary_loss_clip": 0.01124878, "auxiliary_loss_mlp": 0.01040149, "balance_loss_clip": 1.0464859, "balance_loss_mlp": 1.02212501, "epoch": 0.2542612355328423, "flos": 11464409905920.0, "grad_norm": 2.314170874410929, "language_loss": 0.86946094, "learning_rate": 3.4952122800472336e-06, "loss": 0.8911112, "num_input_tokens_seen": 91302485, "step": 4229, "time_per_iteration": 2.629518985748291 }, { "auxiliary_loss_clip": 0.01102882, "auxiliary_loss_mlp": 0.01042519, "balance_loss_clip": 1.04811144, "balance_loss_mlp": 1.0241369, "epoch": 0.2543213587855103, "flos": 22965879669120.0, "grad_norm": 1.7811216632522446, "language_loss": 0.77265114, "learning_rate": 3.4949535937036892e-06, "loss": 0.79410517, "num_input_tokens_seen": 91321120, "step": 4230, "time_per_iteration": 2.715655565261841 }, { "auxiliary_loss_clip": 0.01133364, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.0504818, "balance_loss_mlp": 1.03074503, "epoch": 0.2543814820381783, "flos": 18253178438400.0, "grad_norm": 1.8956341732473607, "language_loss": 0.7550717, "learning_rate": 3.4946948506713544e-06, "loss": 0.77688015, "num_input_tokens_seen": 91338575, "step": 4231, "time_per_iteration": 2.6945316791534424 }, { "auxiliary_loss_clip": 0.0113214, "auxiliary_loss_mlp": 0.01038979, "balance_loss_clip": 1.04939127, "balance_loss_mlp": 1.0230999, "epoch": 0.25444160529084625, "flos": 15632705629440.0, "grad_norm": 1.6179274617095247, "language_loss": 0.73618764, "learning_rate": 3.4944360509600416e-06, "loss": 0.75789881, "num_input_tokens_seen": 91357355, "step": 4232, "time_per_iteration": 2.6219112873077393 }, { "auxiliary_loss_clip": 0.01149145, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.05579972, "balance_loss_mlp": 1.02589035, "epoch": 0.2545017285435142, "flos": 24601610142720.0, "grad_norm": 2.2856831174377388, "language_loss": 0.86333203, "learning_rate": 3.4941771945795637e-06, "loss": 0.88525456, "num_input_tokens_seen": 91376515, "step": 4233, "time_per_iteration": 2.675877809524536 }, { "auxiliary_loss_clip": 0.01080108, "auxiliary_loss_mlp": 0.01040124, "balance_loss_clip": 1.04641938, "balance_loss_mlp": 1.02457917, "epoch": 0.2545618517961822, "flos": 24679069822080.0, "grad_norm": 1.5382450997432586, "language_loss": 0.75319451, "learning_rate": 3.493918281539737e-06, "loss": 0.77439684, "num_input_tokens_seen": 91397595, "step": 4234, "time_per_iteration": 2.9050087928771973 }, { "auxiliary_loss_clip": 0.01117427, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.05171227, "balance_loss_mlp": 1.02897787, "epoch": 0.25462197504885015, "flos": 23915106432000.0, "grad_norm": 2.6382014960101765, "language_loss": 0.74923635, "learning_rate": 3.493659311850379e-06, "loss": 0.77085495, "num_input_tokens_seen": 91417775, "step": 4235, "time_per_iteration": 2.788041353225708 }, { "auxiliary_loss_clip": 0.01124445, "auxiliary_loss_mlp": 0.00776537, "balance_loss_clip": 1.05315781, "balance_loss_mlp": 1.00115323, "epoch": 0.2546820983015181, "flos": 24789387467520.0, "grad_norm": 1.9882672691222136, "language_loss": 0.64451182, "learning_rate": 3.4934002855213106e-06, "loss": 0.66352159, "num_input_tokens_seen": 91437665, "step": 4236, "time_per_iteration": 2.8649141788482666 }, { "auxiliary_loss_clip": 0.01144465, "auxiliary_loss_mlp": 0.01036249, "balance_loss_clip": 1.05185175, "balance_loss_mlp": 1.02122915, "epoch": 0.2547422215541861, "flos": 18734130570240.0, "grad_norm": 1.6410229940010734, "language_loss": 0.6714325, "learning_rate": 3.493141202562354e-06, "loss": 0.69323969, "num_input_tokens_seen": 91456705, "step": 4237, "time_per_iteration": 4.262012958526611 }, { "auxiliary_loss_clip": 0.01147064, "auxiliary_loss_mlp": 0.01049012, "balance_loss_clip": 1.05240059, "balance_loss_mlp": 1.03203678, "epoch": 0.25480234480685404, "flos": 21032449274880.0, "grad_norm": 2.0013967295828237, "language_loss": 0.75415373, "learning_rate": 3.492882062983333e-06, "loss": 0.77611452, "num_input_tokens_seen": 91475535, "step": 4238, "time_per_iteration": 2.6378636360168457 }, { "auxiliary_loss_clip": 0.01137265, "auxiliary_loss_mlp": 0.01046047, "balance_loss_clip": 1.05366278, "balance_loss_mlp": 1.02843964, "epoch": 0.254862468059522, "flos": 25082167224960.0, "grad_norm": 3.4417299363308613, "language_loss": 0.80712521, "learning_rate": 3.492622866794074e-06, "loss": 0.82895833, "num_input_tokens_seen": 91499140, "step": 4239, "time_per_iteration": 4.348390579223633 }, { "auxiliary_loss_clip": 0.01128023, "auxiliary_loss_mlp": 0.01045872, "balance_loss_clip": 1.0522213, "balance_loss_mlp": 1.02870631, "epoch": 0.25492259131219, "flos": 20558392554240.0, "grad_norm": 1.7312526359597522, "language_loss": 0.77521586, "learning_rate": 3.492363614004407e-06, "loss": 0.79695487, "num_input_tokens_seen": 91518335, "step": 4240, "time_per_iteration": 2.7501273155212402 }, { "auxiliary_loss_clip": 0.01151347, "auxiliary_loss_mlp": 0.01040734, "balance_loss_clip": 1.05296493, "balance_loss_mlp": 1.0226146, "epoch": 0.25498271456485794, "flos": 25042485674880.0, "grad_norm": 3.3593092651087595, "language_loss": 0.83430749, "learning_rate": 3.492104304624162e-06, "loss": 0.85622829, "num_input_tokens_seen": 91537655, "step": 4241, "time_per_iteration": 2.7480928897857666 }, { "auxiliary_loss_clip": 0.01137407, "auxiliary_loss_mlp": 0.01045384, "balance_loss_clip": 1.05306387, "balance_loss_mlp": 1.02887392, "epoch": 0.2550428378175259, "flos": 26178412354560.0, "grad_norm": 1.6379574895871623, "language_loss": 0.73322648, "learning_rate": 3.4918449386631725e-06, "loss": 0.75505441, "num_input_tokens_seen": 91557545, "step": 4242, "time_per_iteration": 2.713635206222534 }, { "auxiliary_loss_clip": 0.0114709, "auxiliary_loss_mlp": 0.00774169, "balance_loss_clip": 1.05182981, "balance_loss_mlp": 1.00115824, "epoch": 0.2551029610701939, "flos": 15267170874240.0, "grad_norm": 3.2486673035230993, "language_loss": 0.72336024, "learning_rate": 3.491585516131273e-06, "loss": 0.7425729, "num_input_tokens_seen": 91574405, "step": 4243, "time_per_iteration": 4.298815727233887 }, { "auxiliary_loss_clip": 0.0113532, "auxiliary_loss_mlp": 0.01045095, "balance_loss_clip": 1.05183125, "balance_loss_mlp": 1.02797616, "epoch": 0.2551630843228619, "flos": 18112193556480.0, "grad_norm": 1.8323151946393021, "language_loss": 0.82076979, "learning_rate": 3.491326037038301e-06, "loss": 0.842574, "num_input_tokens_seen": 91593755, "step": 4244, "time_per_iteration": 2.6497015953063965 }, { "auxiliary_loss_clip": 0.01054616, "auxiliary_loss_mlp": 0.01017916, "balance_loss_clip": 1.03294289, "balance_loss_mlp": 1.01572227, "epoch": 0.25522320757552985, "flos": 70520192167680.0, "grad_norm": 0.6914168393706984, "language_loss": 0.57701397, "learning_rate": 3.4910665013940967e-06, "loss": 0.59773928, "num_input_tokens_seen": 91660335, "step": 4245, "time_per_iteration": 3.2938833236694336 }, { "auxiliary_loss_clip": 0.01146552, "auxiliary_loss_mlp": 0.01052395, "balance_loss_clip": 1.0508852, "balance_loss_mlp": 1.03577745, "epoch": 0.2552833308281978, "flos": 22893088757760.0, "grad_norm": 2.1326330958670567, "language_loss": 0.65120399, "learning_rate": 3.4908069092085015e-06, "loss": 0.6731934, "num_input_tokens_seen": 91678500, "step": 4246, "time_per_iteration": 2.5949065685272217 }, { "auxiliary_loss_clip": 0.01127579, "auxiliary_loss_mlp": 0.01044633, "balance_loss_clip": 1.04806828, "balance_loss_mlp": 1.02944601, "epoch": 0.2553434540808658, "flos": 22053605022720.0, "grad_norm": 1.7151532201527704, "language_loss": 0.81580049, "learning_rate": 3.4905472604913585e-06, "loss": 0.83752257, "num_input_tokens_seen": 91696430, "step": 4247, "time_per_iteration": 2.673624277114868 }, { "auxiliary_loss_clip": 0.01140059, "auxiliary_loss_mlp": 0.01044068, "balance_loss_clip": 1.05152941, "balance_loss_mlp": 1.02543616, "epoch": 0.25540357733353375, "flos": 16544190176640.0, "grad_norm": 2.241724474505105, "language_loss": 0.83335149, "learning_rate": 3.490287555252514e-06, "loss": 0.85519278, "num_input_tokens_seen": 91713270, "step": 4248, "time_per_iteration": 2.617570400238037 }, { "auxiliary_loss_clip": 0.01112618, "auxiliary_loss_mlp": 0.01042154, "balance_loss_clip": 1.04433584, "balance_loss_mlp": 1.02458215, "epoch": 0.2554637005862017, "flos": 17565022702080.0, "grad_norm": 2.084670538042193, "language_loss": 0.84011936, "learning_rate": 3.4900277935018166e-06, "loss": 0.8616671, "num_input_tokens_seen": 91728865, "step": 4249, "time_per_iteration": 2.6617467403411865 }, { "auxiliary_loss_clip": 0.01001275, "auxiliary_loss_mlp": 0.01002657, "balance_loss_clip": 1.0228157, "balance_loss_mlp": 0.9996174, "epoch": 0.2555238238388697, "flos": 72244763953920.0, "grad_norm": 0.765792812565725, "language_loss": 0.56274796, "learning_rate": 3.489767975249115e-06, "loss": 0.58278728, "num_input_tokens_seen": 91787470, "step": 4250, "time_per_iteration": 3.24300479888916 }, { "auxiliary_loss_clip": 0.01117816, "auxiliary_loss_mlp": 0.01036136, "balance_loss_clip": 1.04929769, "balance_loss_mlp": 1.01839769, "epoch": 0.25558394709153764, "flos": 24389414547840.0, "grad_norm": 2.294460262471245, "language_loss": 0.80566651, "learning_rate": 3.4895081005042632e-06, "loss": 0.82720602, "num_input_tokens_seen": 91805640, "step": 4251, "time_per_iteration": 2.732752561569214 }, { "auxiliary_loss_clip": 0.01030367, "auxiliary_loss_mlp": 0.01001193, "balance_loss_clip": 1.02468216, "balance_loss_mlp": 0.99888068, "epoch": 0.2556440703442056, "flos": 69231213636480.0, "grad_norm": 0.7932625116211053, "language_loss": 0.6608988, "learning_rate": 3.4892481692771146e-06, "loss": 0.68121445, "num_input_tokens_seen": 91869695, "step": 4252, "time_per_iteration": 3.304985523223877 }, { "auxiliary_loss_clip": 0.01130428, "auxiliary_loss_mlp": 0.01036056, "balance_loss_clip": 1.0499115, "balance_loss_mlp": 1.02097619, "epoch": 0.2557041935968736, "flos": 24863902231680.0, "grad_norm": 2.60951363435401, "language_loss": 0.73882902, "learning_rate": 3.4889881815775267e-06, "loss": 0.76049387, "num_input_tokens_seen": 91889920, "step": 4253, "time_per_iteration": 2.706052303314209 }, { "auxiliary_loss_clip": 0.01097964, "auxiliary_loss_mlp": 0.01044298, "balance_loss_clip": 1.04340124, "balance_loss_mlp": 1.02782309, "epoch": 0.25576431684954154, "flos": 22492110257280.0, "grad_norm": 2.978807414856607, "language_loss": 0.72565317, "learning_rate": 3.488728137415357e-06, "loss": 0.7470758, "num_input_tokens_seen": 91908665, "step": 4254, "time_per_iteration": 2.7579715251922607 }, { "auxiliary_loss_clip": 0.01098791, "auxiliary_loss_mlp": 0.00774228, "balance_loss_clip": 1.04665136, "balance_loss_mlp": 1.001104, "epoch": 0.2558244401022095, "flos": 19826748426240.0, "grad_norm": 1.7240740787107458, "language_loss": 0.80729312, "learning_rate": 3.4884680368004675e-06, "loss": 0.82602334, "num_input_tokens_seen": 91927855, "step": 4255, "time_per_iteration": 2.788978099822998 }, { "auxiliary_loss_clip": 0.01124525, "auxiliary_loss_mlp": 0.01040748, "balance_loss_clip": 1.05111384, "balance_loss_mlp": 1.02414227, "epoch": 0.2558845633548775, "flos": 23220486247680.0, "grad_norm": 1.5275751549355678, "language_loss": 0.85734111, "learning_rate": 3.488207879742721e-06, "loss": 0.87899381, "num_input_tokens_seen": 91948500, "step": 4256, "time_per_iteration": 2.7916831970214844 }, { "auxiliary_loss_clip": 0.01102599, "auxiliary_loss_mlp": 0.01049743, "balance_loss_clip": 1.04525566, "balance_loss_mlp": 1.03164732, "epoch": 0.2559446866075455, "flos": 16837867774080.0, "grad_norm": 1.8301502951270987, "language_loss": 0.74872649, "learning_rate": 3.4879476662519826e-06, "loss": 0.77024996, "num_input_tokens_seen": 91968375, "step": 4257, "time_per_iteration": 2.7754952907562256 }, { "auxiliary_loss_clip": 0.0102418, "auxiliary_loss_mlp": 0.01011535, "balance_loss_clip": 1.03534186, "balance_loss_mlp": 1.00959146, "epoch": 0.25600480986021346, "flos": 57593786895360.0, "grad_norm": 0.8003890262370261, "language_loss": 0.65255105, "learning_rate": 3.4876873963381196e-06, "loss": 0.67290819, "num_input_tokens_seen": 92028490, "step": 4258, "time_per_iteration": 3.269063949584961 }, { "auxiliary_loss_clip": 0.01091736, "auxiliary_loss_mlp": 0.00773347, "balance_loss_clip": 1.04549718, "balance_loss_mlp": 1.00111449, "epoch": 0.2560649331128814, "flos": 27819529868160.0, "grad_norm": 1.5266978755669562, "language_loss": 0.76443565, "learning_rate": 3.4874270700110013e-06, "loss": 0.78308654, "num_input_tokens_seen": 92048060, "step": 4259, "time_per_iteration": 2.805574893951416 }, { "auxiliary_loss_clip": 0.01026212, "auxiliary_loss_mlp": 0.01016368, "balance_loss_clip": 1.02208054, "balance_loss_mlp": 1.01372147, "epoch": 0.2561250563655494, "flos": 70950509101440.0, "grad_norm": 0.7927643603688844, "language_loss": 0.58455491, "learning_rate": 3.4871666872804994e-06, "loss": 0.60498071, "num_input_tokens_seen": 92118180, "step": 4260, "time_per_iteration": 3.3904550075531006 }, { "auxiliary_loss_clip": 0.01133193, "auxiliary_loss_mlp": 0.01048996, "balance_loss_clip": 1.04874313, "balance_loss_mlp": 1.03204465, "epoch": 0.25618517961821735, "flos": 27012329481600.0, "grad_norm": 3.3188145253338543, "language_loss": 0.77064955, "learning_rate": 3.4869062481564875e-06, "loss": 0.79247141, "num_input_tokens_seen": 92137570, "step": 4261, "time_per_iteration": 2.769864082336426 }, { "auxiliary_loss_clip": 0.01144035, "auxiliary_loss_mlp": 0.01040091, "balance_loss_clip": 1.05178332, "balance_loss_mlp": 1.02465355, "epoch": 0.2562453028708853, "flos": 23068296322560.0, "grad_norm": 1.5699122250769224, "language_loss": 0.83367205, "learning_rate": 3.486645752648842e-06, "loss": 0.85551333, "num_input_tokens_seen": 92157625, "step": 4262, "time_per_iteration": 2.682828426361084 }, { "auxiliary_loss_clip": 0.01134556, "auxiliary_loss_mlp": 0.01041299, "balance_loss_clip": 1.05219626, "balance_loss_mlp": 1.02344143, "epoch": 0.2563054261235533, "flos": 15120942606720.0, "grad_norm": 2.340862226505914, "language_loss": 0.73892939, "learning_rate": 3.4863852007674405e-06, "loss": 0.76068795, "num_input_tokens_seen": 92175350, "step": 4263, "time_per_iteration": 2.70947003364563 }, { "auxiliary_loss_clip": 0.0111297, "auxiliary_loss_mlp": 0.00773371, "balance_loss_clip": 1.05221081, "balance_loss_mlp": 1.00093555, "epoch": 0.25636554937622125, "flos": 27854865872640.0, "grad_norm": 1.8143922917988324, "language_loss": 0.82766259, "learning_rate": 3.486124592522163e-06, "loss": 0.84652603, "num_input_tokens_seen": 92196070, "step": 4264, "time_per_iteration": 2.7249553203582764 }, { "auxiliary_loss_clip": 0.01133012, "auxiliary_loss_mlp": 0.01041877, "balance_loss_clip": 1.05265546, "balance_loss_mlp": 1.02468669, "epoch": 0.2564256726288892, "flos": 28906509288960.0, "grad_norm": 2.8986425954305206, "language_loss": 0.74346334, "learning_rate": 3.4858639279228924e-06, "loss": 0.76521224, "num_input_tokens_seen": 92216310, "step": 4265, "time_per_iteration": 2.7149150371551514 }, { "auxiliary_loss_clip": 0.01110152, "auxiliary_loss_mlp": 0.01036531, "balance_loss_clip": 1.04754925, "balance_loss_mlp": 1.02034247, "epoch": 0.2564857958815572, "flos": 18514931823360.0, "grad_norm": 15.50909821859273, "language_loss": 0.81623137, "learning_rate": 3.485603206979513e-06, "loss": 0.83769822, "num_input_tokens_seen": 92234510, "step": 4266, "time_per_iteration": 2.6890153884887695 }, { "auxiliary_loss_clip": 0.01083702, "auxiliary_loss_mlp": 0.01050109, "balance_loss_clip": 1.0468955, "balance_loss_mlp": 1.0318346, "epoch": 0.25654591913422514, "flos": 25808280658560.0, "grad_norm": 2.4522850064786037, "language_loss": 0.79120672, "learning_rate": 3.4853424297019103e-06, "loss": 0.81254482, "num_input_tokens_seen": 92254070, "step": 4267, "time_per_iteration": 2.8390700817108154 }, { "auxiliary_loss_clip": 0.01094597, "auxiliary_loss_mlp": 0.01044608, "balance_loss_clip": 1.04643822, "balance_loss_mlp": 1.0276804, "epoch": 0.2566060423868931, "flos": 19099665325440.0, "grad_norm": 1.6765306902124857, "language_loss": 0.79241312, "learning_rate": 3.4850815960999736e-06, "loss": 0.81380516, "num_input_tokens_seen": 92275060, "step": 4268, "time_per_iteration": 2.7324178218841553 }, { "auxiliary_loss_clip": 0.01106667, "auxiliary_loss_mlp": 0.00778662, "balance_loss_clip": 1.04940808, "balance_loss_mlp": 1.00098729, "epoch": 0.25666616563956113, "flos": 23842674656640.0, "grad_norm": 1.8248642507450341, "language_loss": 0.67737979, "learning_rate": 3.484820706183595e-06, "loss": 0.69623303, "num_input_tokens_seen": 92293610, "step": 4269, "time_per_iteration": 2.7897677421569824 }, { "auxiliary_loss_clip": 0.01123993, "auxiliary_loss_mlp": 0.01043408, "balance_loss_clip": 1.05155373, "balance_loss_mlp": 1.02596736, "epoch": 0.2567262888922291, "flos": 14604259420800.0, "grad_norm": 3.069203267679029, "language_loss": 0.79117787, "learning_rate": 3.484559759962666e-06, "loss": 0.81285185, "num_input_tokens_seen": 92308305, "step": 4270, "time_per_iteration": 2.8076114654541016 }, { "auxiliary_loss_clip": 0.01094814, "auxiliary_loss_mlp": 0.010436, "balance_loss_clip": 1.04357839, "balance_loss_mlp": 1.02393079, "epoch": 0.25678641214489706, "flos": 32923117877760.0, "grad_norm": 2.413207422396751, "language_loss": 0.68088073, "learning_rate": 3.4842987574470816e-06, "loss": 0.7022649, "num_input_tokens_seen": 92329875, "step": 4271, "time_per_iteration": 2.8195667266845703 }, { "auxiliary_loss_clip": 0.01136281, "auxiliary_loss_mlp": 0.00774788, "balance_loss_clip": 1.05146289, "balance_loss_mlp": 1.00110972, "epoch": 0.256846535397565, "flos": 24098933260800.0, "grad_norm": 3.3671515903121216, "language_loss": 0.87362605, "learning_rate": 3.4840376986467403e-06, "loss": 0.89273679, "num_input_tokens_seen": 92348780, "step": 4272, "time_per_iteration": 2.6910364627838135 }, { "auxiliary_loss_clip": 0.01122968, "auxiliary_loss_mlp": 0.01046328, "balance_loss_clip": 1.05348301, "balance_loss_mlp": 1.02854192, "epoch": 0.256906658650233, "flos": 19718441942400.0, "grad_norm": 1.6813472119330561, "language_loss": 0.81420678, "learning_rate": 3.483776583571541e-06, "loss": 0.83589977, "num_input_tokens_seen": 92368175, "step": 4273, "time_per_iteration": 2.6883673667907715 }, { "auxiliary_loss_clip": 0.01097944, "auxiliary_loss_mlp": 0.01041741, "balance_loss_clip": 1.043715, "balance_loss_mlp": 1.02459884, "epoch": 0.25696678190290095, "flos": 22926018551040.0, "grad_norm": 3.3251008044769947, "language_loss": 0.76944637, "learning_rate": 3.4835154122313846e-06, "loss": 0.79084325, "num_input_tokens_seen": 92387755, "step": 4274, "time_per_iteration": 2.7613401412963867 }, { "auxiliary_loss_clip": 0.01112797, "auxiliary_loss_mlp": 0.01039272, "balance_loss_clip": 1.04380774, "balance_loss_mlp": 1.02220166, "epoch": 0.2570269051555689, "flos": 27307838672640.0, "grad_norm": 2.1172072427968933, "language_loss": 0.83780324, "learning_rate": 3.4832541846361743e-06, "loss": 0.85932392, "num_input_tokens_seen": 92409850, "step": 4275, "time_per_iteration": 2.7835779190063477 }, { "auxiliary_loss_clip": 0.01120289, "auxiliary_loss_mlp": 0.01039714, "balance_loss_clip": 1.05141211, "balance_loss_mlp": 1.02223814, "epoch": 0.2570870284082369, "flos": 27563414918400.0, "grad_norm": 2.725989678545036, "language_loss": 0.7874397, "learning_rate": 3.4829929007958175e-06, "loss": 0.80903983, "num_input_tokens_seen": 92431250, "step": 4276, "time_per_iteration": 5.679298400878906 }, { "auxiliary_loss_clip": 0.01136261, "auxiliary_loss_mlp": 0.01046327, "balance_loss_clip": 1.05269814, "balance_loss_mlp": 1.02982879, "epoch": 0.25714715166090485, "flos": 28730834847360.0, "grad_norm": 4.7083902318823885, "language_loss": 0.79273927, "learning_rate": 3.4827315607202214e-06, "loss": 0.81456512, "num_input_tokens_seen": 92452065, "step": 4277, "time_per_iteration": 2.691035270690918 }, { "auxiliary_loss_clip": 0.01146238, "auxiliary_loss_mlp": 0.01040113, "balance_loss_clip": 1.05214763, "balance_loss_mlp": 1.02367437, "epoch": 0.2572072749135728, "flos": 20116152305280.0, "grad_norm": 2.017980063834791, "language_loss": 0.78986102, "learning_rate": 3.482470164419295e-06, "loss": 0.81172454, "num_input_tokens_seen": 92470025, "step": 4278, "time_per_iteration": 4.2404680252075195 }, { "auxiliary_loss_clip": 0.01126121, "auxiliary_loss_mlp": 0.01037901, "balance_loss_clip": 1.05176449, "balance_loss_mlp": 1.02102113, "epoch": 0.2572673981662408, "flos": 26030855283840.0, "grad_norm": 2.8070462448385904, "language_loss": 0.74898899, "learning_rate": 3.482208711902952e-06, "loss": 0.77062923, "num_input_tokens_seen": 92489825, "step": 4279, "time_per_iteration": 2.65977144241333 }, { "auxiliary_loss_clip": 0.01134686, "auxiliary_loss_mlp": 0.01051687, "balance_loss_clip": 1.04973292, "balance_loss_mlp": 1.03423464, "epoch": 0.25732752141890874, "flos": 16106618695680.0, "grad_norm": 2.4256697448035687, "language_loss": 0.85603923, "learning_rate": 3.4819472031811065e-06, "loss": 0.87790298, "num_input_tokens_seen": 92507270, "step": 4280, "time_per_iteration": 2.6072864532470703 }, { "auxiliary_loss_clip": 0.01136623, "auxiliary_loss_mlp": 0.01039056, "balance_loss_clip": 1.05183434, "balance_loss_mlp": 1.02147269, "epoch": 0.2573876446715767, "flos": 22524429519360.0, "grad_norm": 3.9579835716917695, "language_loss": 0.79381943, "learning_rate": 3.4816856382636744e-06, "loss": 0.8155762, "num_input_tokens_seen": 92526300, "step": 4281, "time_per_iteration": 2.613163471221924 }, { "auxiliary_loss_clip": 0.01110196, "auxiliary_loss_mlp": 0.01038018, "balance_loss_clip": 1.04847932, "balance_loss_mlp": 1.02099478, "epoch": 0.2574477679242447, "flos": 23950837486080.0, "grad_norm": 2.240063499401578, "language_loss": 0.87314785, "learning_rate": 3.4814240171605737e-06, "loss": 0.89462996, "num_input_tokens_seen": 92546465, "step": 4282, "time_per_iteration": 4.489396333694458 }, { "auxiliary_loss_clip": 0.01148783, "auxiliary_loss_mlp": 0.01046594, "balance_loss_clip": 1.0526619, "balance_loss_mlp": 1.02959502, "epoch": 0.2575078911769127, "flos": 21981711951360.0, "grad_norm": 1.5167715532309152, "language_loss": 0.70110047, "learning_rate": 3.4811623398817267e-06, "loss": 0.72305429, "num_input_tokens_seen": 92567260, "step": 4283, "time_per_iteration": 2.619131565093994 }, { "auxiliary_loss_clip": 0.01144466, "auxiliary_loss_mlp": 0.00774605, "balance_loss_clip": 1.05443883, "balance_loss_mlp": 1.0010494, "epoch": 0.25756801442958066, "flos": 21945406279680.0, "grad_norm": 1.950947388276708, "language_loss": 0.80411774, "learning_rate": 3.4809006064370553e-06, "loss": 0.82330847, "num_input_tokens_seen": 92585425, "step": 4284, "time_per_iteration": 2.656998634338379 }, { "auxiliary_loss_clip": 0.01105473, "auxiliary_loss_mlp": 0.01039993, "balance_loss_clip": 1.05797076, "balance_loss_mlp": 1.02488899, "epoch": 0.2576281376822486, "flos": 35261980058880.0, "grad_norm": 2.2559612506718434, "language_loss": 0.70473522, "learning_rate": 3.4806388168364835e-06, "loss": 0.72618985, "num_input_tokens_seen": 92604770, "step": 4285, "time_per_iteration": 2.880835771560669 }, { "auxiliary_loss_clip": 0.01127807, "auxiliary_loss_mlp": 0.0104515, "balance_loss_clip": 1.05229783, "balance_loss_mlp": 1.02971268, "epoch": 0.2576882609349166, "flos": 14132285688960.0, "grad_norm": 1.8739093647405893, "language_loss": 0.58494061, "learning_rate": 3.4803769710899402e-06, "loss": 0.6066702, "num_input_tokens_seen": 92622635, "step": 4286, "time_per_iteration": 2.63923978805542 }, { "auxiliary_loss_clip": 0.01138174, "auxiliary_loss_mlp": 0.01046794, "balance_loss_clip": 1.05271184, "balance_loss_mlp": 1.03020048, "epoch": 0.25774838418758456, "flos": 23258336204160.0, "grad_norm": 1.4732857929087761, "language_loss": 0.63687879, "learning_rate": 3.480115069207354e-06, "loss": 0.65872842, "num_input_tokens_seen": 92642960, "step": 4287, "time_per_iteration": 2.67764949798584 }, { "auxiliary_loss_clip": 0.01127889, "auxiliary_loss_mlp": 0.01045385, "balance_loss_clip": 1.05252934, "balance_loss_mlp": 1.02769411, "epoch": 0.2578085074402525, "flos": 22601745544320.0, "grad_norm": 2.134546441867425, "language_loss": 0.71780413, "learning_rate": 3.4798531111986557e-06, "loss": 0.73953688, "num_input_tokens_seen": 92662455, "step": 4288, "time_per_iteration": 2.7174036502838135 }, { "auxiliary_loss_clip": 0.0110996, "auxiliary_loss_mlp": 0.01042748, "balance_loss_clip": 1.04934072, "balance_loss_mlp": 1.02691674, "epoch": 0.2578686306929205, "flos": 24571840746240.0, "grad_norm": 1.4449800602700236, "language_loss": 0.77059102, "learning_rate": 3.4795910970737786e-06, "loss": 0.79211813, "num_input_tokens_seen": 92683520, "step": 4289, "time_per_iteration": 2.748249053955078 }, { "auxiliary_loss_clip": 0.01146276, "auxiliary_loss_mlp": 0.00775089, "balance_loss_clip": 1.05252326, "balance_loss_mlp": 1.001122, "epoch": 0.25792875394558845, "flos": 18113953322880.0, "grad_norm": 2.0235699584636295, "language_loss": 0.85416883, "learning_rate": 3.4793290268426592e-06, "loss": 0.87338245, "num_input_tokens_seen": 92701450, "step": 4290, "time_per_iteration": 2.593461751937866 }, { "auxiliary_loss_clip": 0.01114221, "auxiliary_loss_mlp": 0.01056837, "balance_loss_clip": 1.05081999, "balance_loss_mlp": 1.03660691, "epoch": 0.2579888771982564, "flos": 17712902995200.0, "grad_norm": 2.4272093439618847, "language_loss": 0.72360331, "learning_rate": 3.4790669005152354e-06, "loss": 0.74531388, "num_input_tokens_seen": 92720355, "step": 4291, "time_per_iteration": 2.6838138103485107 }, { "auxiliary_loss_clip": 0.01150945, "auxiliary_loss_mlp": 0.0104494, "balance_loss_clip": 1.05378067, "balance_loss_mlp": 1.02758288, "epoch": 0.2580490004509244, "flos": 16434878112000.0, "grad_norm": 2.78045823134535, "language_loss": 0.80846477, "learning_rate": 3.4788047181014458e-06, "loss": 0.83042365, "num_input_tokens_seen": 92736755, "step": 4292, "time_per_iteration": 2.595710277557373 }, { "auxiliary_loss_clip": 0.0115367, "auxiliary_loss_mlp": 0.01044878, "balance_loss_clip": 1.05773902, "balance_loss_mlp": 1.02702022, "epoch": 0.25810912370359235, "flos": 33835141128960.0, "grad_norm": 2.057533015633898, "language_loss": 0.67592025, "learning_rate": 3.4785424796112337e-06, "loss": 0.69790578, "num_input_tokens_seen": 92757655, "step": 4293, "time_per_iteration": 2.699570894241333 }, { "auxiliary_loss_clip": 0.0110485, "auxiliary_loss_mlp": 0.01048043, "balance_loss_clip": 1.04971898, "balance_loss_mlp": 1.03190207, "epoch": 0.2581692469562603, "flos": 25192197561600.0, "grad_norm": 2.0097854631835217, "language_loss": 0.75671911, "learning_rate": 3.478280185054542e-06, "loss": 0.77824801, "num_input_tokens_seen": 92776100, "step": 4294, "time_per_iteration": 2.7217960357666016 }, { "auxiliary_loss_clip": 0.01098332, "auxiliary_loss_mlp": 0.01053556, "balance_loss_clip": 1.0444684, "balance_loss_mlp": 1.03404188, "epoch": 0.2582293702089283, "flos": 34932212271360.0, "grad_norm": 1.7798433628760433, "language_loss": 0.8047998, "learning_rate": 3.478017834441318e-06, "loss": 0.82631868, "num_input_tokens_seen": 92798880, "step": 4295, "time_per_iteration": 2.871460437774658 }, { "auxiliary_loss_clip": 0.01055358, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.04843688, "balance_loss_mlp": 1.0256989, "epoch": 0.2582894934615963, "flos": 26833746038400.0, "grad_norm": 2.1012913939780753, "language_loss": 0.72843397, "learning_rate": 3.4777554277815096e-06, "loss": 0.74942946, "num_input_tokens_seen": 92817750, "step": 4296, "time_per_iteration": 3.173367738723755 }, { "auxiliary_loss_clip": 0.01091622, "auxiliary_loss_mlp": 0.01038465, "balance_loss_clip": 1.05392241, "balance_loss_mlp": 1.02106011, "epoch": 0.25834961671426426, "flos": 23515241253120.0, "grad_norm": 1.5772062283828172, "language_loss": 0.86928564, "learning_rate": 3.477492965085067e-06, "loss": 0.8905865, "num_input_tokens_seen": 92837995, "step": 4297, "time_per_iteration": 3.1598868370056152 }, { "auxiliary_loss_clip": 0.01149748, "auxiliary_loss_mlp": 0.01047412, "balance_loss_clip": 1.05517435, "balance_loss_mlp": 1.03090191, "epoch": 0.25840973996693223, "flos": 22451028076800.0, "grad_norm": 1.8030727150796175, "language_loss": 0.84720427, "learning_rate": 3.477230446361943e-06, "loss": 0.86917591, "num_input_tokens_seen": 92857245, "step": 4298, "time_per_iteration": 2.632448196411133 }, { "auxiliary_loss_clip": 0.01135108, "auxiliary_loss_mlp": 0.00775458, "balance_loss_clip": 1.05262494, "balance_loss_mlp": 1.00111055, "epoch": 0.2584698632196002, "flos": 11290854366720.0, "grad_norm": 2.0124667048247686, "language_loss": 0.83514953, "learning_rate": 3.4769678716220927e-06, "loss": 0.8542552, "num_input_tokens_seen": 92873265, "step": 4299, "time_per_iteration": 2.631248950958252 }, { "auxiliary_loss_clip": 0.01117485, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.05216849, "balance_loss_mlp": 1.01868308, "epoch": 0.25852998647226816, "flos": 17929982839680.0, "grad_norm": 2.419754138344463, "language_loss": 0.82422709, "learning_rate": 3.4767052408754726e-06, "loss": 0.84575242, "num_input_tokens_seen": 92890880, "step": 4300, "time_per_iteration": 2.650834083557129 }, { "auxiliary_loss_clip": 0.0113846, "auxiliary_loss_mlp": 0.01041208, "balance_loss_clip": 1.0535903, "balance_loss_mlp": 1.02343392, "epoch": 0.2585901097249361, "flos": 33256117889280.0, "grad_norm": 2.971673559214411, "language_loss": 0.66949177, "learning_rate": 3.4764425541320417e-06, "loss": 0.69128841, "num_input_tokens_seen": 92910770, "step": 4301, "time_per_iteration": 2.729519844055176 }, { "auxiliary_loss_clip": 0.01139778, "auxiliary_loss_mlp": 0.01040158, "balance_loss_clip": 1.05335701, "balance_loss_mlp": 1.02245533, "epoch": 0.2586502329776041, "flos": 18441278985600.0, "grad_norm": 2.29820997177689, "language_loss": 0.81177735, "learning_rate": 3.4761798114017617e-06, "loss": 0.83357668, "num_input_tokens_seen": 92929520, "step": 4302, "time_per_iteration": 2.5496692657470703 }, { "auxiliary_loss_clip": 0.01105433, "auxiliary_loss_mlp": 0.01042423, "balance_loss_clip": 1.05242491, "balance_loss_mlp": 1.02542388, "epoch": 0.25871035623027205, "flos": 17968120104960.0, "grad_norm": 1.8036447001063776, "language_loss": 0.92147923, "learning_rate": 3.475917012694595e-06, "loss": 0.94295776, "num_input_tokens_seen": 92947890, "step": 4303, "time_per_iteration": 2.686222791671753 }, { "auxiliary_loss_clip": 0.01141887, "auxiliary_loss_mlp": 0.01040139, "balance_loss_clip": 1.05643094, "balance_loss_mlp": 1.02322304, "epoch": 0.25877047948294, "flos": 27777729415680.0, "grad_norm": 2.7085759571044368, "language_loss": 0.67138135, "learning_rate": 3.475654158020507e-06, "loss": 0.69320166, "num_input_tokens_seen": 92967690, "step": 4304, "time_per_iteration": 2.665797472000122 }, { "auxiliary_loss_clip": 0.01113882, "auxiliary_loss_mlp": 0.01041979, "balance_loss_clip": 1.0509342, "balance_loss_mlp": 1.02498007, "epoch": 0.258830602735608, "flos": 27125843437440.0, "grad_norm": 2.126938769919949, "language_loss": 0.72085559, "learning_rate": 3.4753912473894657e-06, "loss": 0.74241412, "num_input_tokens_seen": 92986830, "step": 4305, "time_per_iteration": 2.7514076232910156 }, { "auxiliary_loss_clip": 0.01103045, "auxiliary_loss_mlp": 0.00775987, "balance_loss_clip": 1.04804707, "balance_loss_mlp": 1.00122118, "epoch": 0.25889072598827595, "flos": 17891486438400.0, "grad_norm": 6.414506312387852, "language_loss": 0.76175749, "learning_rate": 3.4751282808114403e-06, "loss": 0.78054774, "num_input_tokens_seen": 93002740, "step": 4306, "time_per_iteration": 2.7326161861419678 }, { "auxiliary_loss_clip": 0.01049461, "auxiliary_loss_mlp": 0.0102188, "balance_loss_clip": 1.03476799, "balance_loss_mlp": 1.01943636, "epoch": 0.2589508492409439, "flos": 53934955724160.0, "grad_norm": 0.8427062291747792, "language_loss": 0.57128024, "learning_rate": 3.474865258296403e-06, "loss": 0.59199357, "num_input_tokens_seen": 93058645, "step": 4307, "time_per_iteration": 3.1499595642089844 }, { "auxiliary_loss_clip": 0.01123356, "auxiliary_loss_mlp": 0.01045032, "balance_loss_clip": 1.0514828, "balance_loss_mlp": 1.02858078, "epoch": 0.2590109724936119, "flos": 22125785402880.0, "grad_norm": 1.5299746109283647, "language_loss": 0.71727359, "learning_rate": 3.474602179854327e-06, "loss": 0.73895752, "num_input_tokens_seen": 93077140, "step": 4308, "time_per_iteration": 2.6824283599853516 }, { "auxiliary_loss_clip": 0.01152705, "auxiliary_loss_mlp": 0.01046843, "balance_loss_clip": 1.05659723, "balance_loss_mlp": 1.02976048, "epoch": 0.2590710957462799, "flos": 13474294398720.0, "grad_norm": 1.8339599204524273, "language_loss": 0.83940542, "learning_rate": 3.4743390454951886e-06, "loss": 0.86140084, "num_input_tokens_seen": 93093580, "step": 4309, "time_per_iteration": 2.560194253921509 }, { "auxiliary_loss_clip": 0.01137306, "auxiliary_loss_mlp": 0.01044025, "balance_loss_clip": 1.05587196, "balance_loss_mlp": 1.02815771, "epoch": 0.25913121899894787, "flos": 22307098279680.0, "grad_norm": 1.5397823214091813, "language_loss": 0.84657532, "learning_rate": 3.474075855228966e-06, "loss": 0.86838865, "num_input_tokens_seen": 93112345, "step": 4310, "time_per_iteration": 2.627716064453125 }, { "auxiliary_loss_clip": 0.01143598, "auxiliary_loss_mlp": 0.01047667, "balance_loss_clip": 1.05802059, "balance_loss_mlp": 1.03141904, "epoch": 0.25919134225161583, "flos": 25811728364160.0, "grad_norm": 2.0190220849922094, "language_loss": 0.77145267, "learning_rate": 3.473812609065639e-06, "loss": 0.79336536, "num_input_tokens_seen": 93131545, "step": 4311, "time_per_iteration": 2.694856643676758 }, { "auxiliary_loss_clip": 0.01110239, "auxiliary_loss_mlp": 0.01052381, "balance_loss_clip": 1.04629123, "balance_loss_mlp": 1.03498793, "epoch": 0.2592514655042838, "flos": 31212262108800.0, "grad_norm": 1.9233367952735905, "language_loss": 0.72848439, "learning_rate": 3.4735493070151904e-06, "loss": 0.75011057, "num_input_tokens_seen": 93150730, "step": 4312, "time_per_iteration": 2.7577714920043945 }, { "auxiliary_loss_clip": 0.01150768, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.05618715, "balance_loss_mlp": 1.02845287, "epoch": 0.25931158875695176, "flos": 18474998878080.0, "grad_norm": 1.8485738044524733, "language_loss": 0.70193493, "learning_rate": 3.4732859490876044e-06, "loss": 0.72388697, "num_input_tokens_seen": 93167895, "step": 4313, "time_per_iteration": 2.6447813510894775 }, { "auxiliary_loss_clip": 0.01150117, "auxiliary_loss_mlp": 0.01054192, "balance_loss_clip": 1.05624926, "balance_loss_mlp": 1.03845656, "epoch": 0.2593717120096197, "flos": 19207935895680.0, "grad_norm": 1.8538125013537565, "language_loss": 0.80462205, "learning_rate": 3.473022535292867e-06, "loss": 0.82666522, "num_input_tokens_seen": 93187650, "step": 4314, "time_per_iteration": 2.6073296070098877 }, { "auxiliary_loss_clip": 0.01110006, "auxiliary_loss_mlp": 0.01049511, "balance_loss_clip": 1.04867387, "balance_loss_mlp": 1.03253555, "epoch": 0.2594318352622877, "flos": 31248100903680.0, "grad_norm": 2.061113629574459, "language_loss": 0.670748, "learning_rate": 3.472759065640968e-06, "loss": 0.69234318, "num_input_tokens_seen": 93207370, "step": 4315, "time_per_iteration": 6.427948236465454 }, { "auxiliary_loss_clip": 0.01096074, "auxiliary_loss_mlp": 0.01056601, "balance_loss_clip": 1.04853845, "balance_loss_mlp": 1.0407939, "epoch": 0.25949195851495566, "flos": 22237144542720.0, "grad_norm": 2.0096953575355125, "language_loss": 0.79649067, "learning_rate": 3.4724955401418976e-06, "loss": 0.81801736, "num_input_tokens_seen": 93227925, "step": 4316, "time_per_iteration": 2.7463796138763428 }, { "auxiliary_loss_clip": 0.01096584, "auxiliary_loss_mlp": 0.01048328, "balance_loss_clip": 1.0487628, "balance_loss_mlp": 1.03112638, "epoch": 0.2595520817676236, "flos": 28075716645120.0, "grad_norm": 3.2727308584132584, "language_loss": 0.77498394, "learning_rate": 3.4722319588056487e-06, "loss": 0.79643309, "num_input_tokens_seen": 93250020, "step": 4317, "time_per_iteration": 4.658867359161377 }, { "auxiliary_loss_clip": 0.01155612, "auxiliary_loss_mlp": 0.01054128, "balance_loss_clip": 1.05959845, "balance_loss_mlp": 1.03734958, "epoch": 0.2596122050202916, "flos": 20190954378240.0, "grad_norm": 2.117435309152476, "language_loss": 0.77656054, "learning_rate": 3.4719683216422163e-06, "loss": 0.79865795, "num_input_tokens_seen": 93269070, "step": 4318, "time_per_iteration": 2.5934906005859375 }, { "auxiliary_loss_clip": 0.01146449, "auxiliary_loss_mlp": 0.01045441, "balance_loss_clip": 1.0530901, "balance_loss_mlp": 1.02733302, "epoch": 0.25967232827295955, "flos": 22527949052160.0, "grad_norm": 1.6144223240331488, "language_loss": 0.76362926, "learning_rate": 3.471704628661598e-06, "loss": 0.78554815, "num_input_tokens_seen": 93290250, "step": 4319, "time_per_iteration": 2.607649564743042 }, { "auxiliary_loss_clip": 0.01125042, "auxiliary_loss_mlp": 0.01041624, "balance_loss_clip": 1.05419481, "balance_loss_mlp": 1.02587628, "epoch": 0.2597324515256275, "flos": 21068252156160.0, "grad_norm": 1.6090277746740278, "language_loss": 0.76549125, "learning_rate": 3.4714408798737925e-06, "loss": 0.78715789, "num_input_tokens_seen": 93310090, "step": 4320, "time_per_iteration": 2.722574472427368 }, { "auxiliary_loss_clip": 0.01116281, "auxiliary_loss_mlp": 0.01042709, "balance_loss_clip": 1.05157554, "balance_loss_mlp": 1.02546, "epoch": 0.2597925747782955, "flos": 22050013662720.0, "grad_norm": 1.6564648175426406, "language_loss": 0.71067965, "learning_rate": 3.471177075288801e-06, "loss": 0.73226953, "num_input_tokens_seen": 93329570, "step": 4321, "time_per_iteration": 4.276093244552612 }, { "auxiliary_loss_clip": 0.01125031, "auxiliary_loss_mlp": 0.01055033, "balance_loss_clip": 1.05191207, "balance_loss_mlp": 1.03549457, "epoch": 0.2598526980309635, "flos": 19536949497600.0, "grad_norm": 1.9031382952841078, "language_loss": 0.74805915, "learning_rate": 3.4709132149166277e-06, "loss": 0.76985979, "num_input_tokens_seen": 93347920, "step": 4322, "time_per_iteration": 2.6573097705841064 }, { "auxiliary_loss_clip": 0.0111558, "auxiliary_loss_mlp": 0.0104757, "balance_loss_clip": 1.05213332, "balance_loss_mlp": 1.03004622, "epoch": 0.25991282128363147, "flos": 24495207079680.0, "grad_norm": 1.8978708709823064, "language_loss": 0.73837054, "learning_rate": 3.470649298767278e-06, "loss": 0.76000202, "num_input_tokens_seen": 93367145, "step": 4323, "time_per_iteration": 2.75765061378479 }, { "auxiliary_loss_clip": 0.01139686, "auxiliary_loss_mlp": 0.00775622, "balance_loss_clip": 1.0509938, "balance_loss_mlp": 1.00099182, "epoch": 0.25997294453629943, "flos": 24201457655040.0, "grad_norm": 2.107506603705316, "language_loss": 0.67186093, "learning_rate": 3.4703853268507597e-06, "loss": 0.69101399, "num_input_tokens_seen": 93386555, "step": 4324, "time_per_iteration": 2.752307891845703 }, { "auxiliary_loss_clip": 0.0109649, "auxiliary_loss_mlp": 0.01045367, "balance_loss_clip": 1.05030632, "balance_loss_mlp": 1.03026319, "epoch": 0.2600330677889674, "flos": 31431460855680.0, "grad_norm": 2.121769328280442, "language_loss": 0.71064055, "learning_rate": 3.470121299177082e-06, "loss": 0.732059, "num_input_tokens_seen": 93405590, "step": 4325, "time_per_iteration": 2.824281692504883 }, { "auxiliary_loss_clip": 0.01134613, "auxiliary_loss_mlp": 0.01035571, "balance_loss_clip": 1.04941416, "balance_loss_mlp": 1.01839304, "epoch": 0.26009319104163536, "flos": 32266527217920.0, "grad_norm": 1.8496839878379767, "language_loss": 0.73106551, "learning_rate": 3.469857215756257e-06, "loss": 0.75276732, "num_input_tokens_seen": 93424750, "step": 4326, "time_per_iteration": 2.7235658168792725 }, { "auxiliary_loss_clip": 0.01118123, "auxiliary_loss_mlp": 0.00776184, "balance_loss_clip": 1.05001175, "balance_loss_mlp": 1.00100303, "epoch": 0.26015331429430333, "flos": 26286754752000.0, "grad_norm": 1.7229255626307804, "language_loss": 0.86908734, "learning_rate": 3.4695930765982997e-06, "loss": 0.88803041, "num_input_tokens_seen": 93443465, "step": 4327, "time_per_iteration": 2.7072155475616455 }, { "auxiliary_loss_clip": 0.01153995, "auxiliary_loss_mlp": 0.00775932, "balance_loss_clip": 1.05640841, "balance_loss_mlp": 1.0008533, "epoch": 0.2602134375469713, "flos": 21142335957120.0, "grad_norm": 1.4664721830580452, "language_loss": 0.80265766, "learning_rate": 3.4693288817132255e-06, "loss": 0.82195687, "num_input_tokens_seen": 93462580, "step": 4328, "time_per_iteration": 2.6463024616241455 }, { "auxiliary_loss_clip": 0.0111992, "auxiliary_loss_mlp": 0.00774533, "balance_loss_clip": 1.04837036, "balance_loss_mlp": 1.00092077, "epoch": 0.26027356079963926, "flos": 25921327737600.0, "grad_norm": 1.6317826670237516, "language_loss": 0.88094193, "learning_rate": 3.4690646311110525e-06, "loss": 0.89988649, "num_input_tokens_seen": 93482790, "step": 4329, "time_per_iteration": 2.7130861282348633 }, { "auxiliary_loss_clip": 0.011478, "auxiliary_loss_mlp": 0.01040633, "balance_loss_clip": 1.05545115, "balance_loss_mlp": 1.02431321, "epoch": 0.2603336840523072, "flos": 26359222440960.0, "grad_norm": 1.8335620949826397, "language_loss": 0.77834195, "learning_rate": 3.468800324801802e-06, "loss": 0.80022621, "num_input_tokens_seen": 93498795, "step": 4330, "time_per_iteration": 2.6223180294036865 }, { "auxiliary_loss_clip": 0.01148961, "auxiliary_loss_mlp": 0.01047898, "balance_loss_clip": 1.0536809, "balance_loss_mlp": 1.03081572, "epoch": 0.2603938073049752, "flos": 23513661054720.0, "grad_norm": 1.5875829464999673, "language_loss": 0.75683081, "learning_rate": 3.4685359627954958e-06, "loss": 0.77879941, "num_input_tokens_seen": 93518335, "step": 4331, "time_per_iteration": 2.6383559703826904 }, { "auxiliary_loss_clip": 0.01130325, "auxiliary_loss_mlp": 0.01042577, "balance_loss_clip": 1.05964541, "balance_loss_mlp": 1.0261023, "epoch": 0.26045393055764315, "flos": 25374300537600.0, "grad_norm": 1.3798785286413686, "language_loss": 0.69174874, "learning_rate": 3.4682715451021584e-06, "loss": 0.71347773, "num_input_tokens_seen": 93539170, "step": 4332, "time_per_iteration": 2.675203800201416 }, { "auxiliary_loss_clip": 0.01117119, "auxiliary_loss_mlp": 0.01048864, "balance_loss_clip": 1.04849494, "balance_loss_mlp": 1.03203201, "epoch": 0.2605140538103111, "flos": 27635272076160.0, "grad_norm": 6.1371153370044915, "language_loss": 0.79897749, "learning_rate": 3.4680070717318174e-06, "loss": 0.82063735, "num_input_tokens_seen": 93558480, "step": 4333, "time_per_iteration": 2.7595479488372803 }, { "auxiliary_loss_clip": 0.01144159, "auxiliary_loss_mlp": 0.01039411, "balance_loss_clip": 1.05260658, "balance_loss_mlp": 1.02317452, "epoch": 0.2605741770629791, "flos": 13769839503360.0, "grad_norm": 1.9478362516602954, "language_loss": 0.80919975, "learning_rate": 3.467742542694501e-06, "loss": 0.83103544, "num_input_tokens_seen": 93575220, "step": 4334, "time_per_iteration": 2.585676670074463 }, { "auxiliary_loss_clip": 0.01121127, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.04868293, "balance_loss_mlp": 1.02051783, "epoch": 0.26063430031564705, "flos": 26031681296640.0, "grad_norm": 1.8490049893982383, "language_loss": 0.8027274, "learning_rate": 3.46747795800024e-06, "loss": 0.82431591, "num_input_tokens_seen": 93597015, "step": 4335, "time_per_iteration": 2.730853796005249 }, { "auxiliary_loss_clip": 0.01060862, "auxiliary_loss_mlp": 0.01054521, "balance_loss_clip": 1.03598261, "balance_loss_mlp": 1.05267298, "epoch": 0.26069442356831507, "flos": 62443809820800.0, "grad_norm": 1.1166557113782816, "language_loss": 0.60850358, "learning_rate": 3.467213317659068e-06, "loss": 0.62965739, "num_input_tokens_seen": 93657775, "step": 4336, "time_per_iteration": 3.1322128772735596 }, { "auxiliary_loss_clip": 0.01111016, "auxiliary_loss_mlp": 0.01046835, "balance_loss_clip": 1.05039525, "balance_loss_mlp": 1.02976441, "epoch": 0.26075454682098304, "flos": 13626376583040.0, "grad_norm": 2.784557437613843, "language_loss": 0.7679469, "learning_rate": 3.46694862168102e-06, "loss": 0.78952539, "num_input_tokens_seen": 93676145, "step": 4337, "time_per_iteration": 2.704305410385132 }, { "auxiliary_loss_clip": 0.0112146, "auxiliary_loss_mlp": 0.01045064, "balance_loss_clip": 1.04997659, "balance_loss_mlp": 1.02728987, "epoch": 0.260814670073651, "flos": 12126531260160.0, "grad_norm": 2.7677016823816976, "language_loss": 0.74653983, "learning_rate": 3.4666838700761334e-06, "loss": 0.76820505, "num_input_tokens_seen": 93692480, "step": 4338, "time_per_iteration": 2.652679204940796 }, { "auxiliary_loss_clip": 0.01140171, "auxiliary_loss_mlp": 0.01040507, "balance_loss_clip": 1.05246329, "balance_loss_mlp": 1.02314997, "epoch": 0.26087479332631897, "flos": 15122522805120.0, "grad_norm": 2.378816803290104, "language_loss": 0.81061137, "learning_rate": 3.466419062854447e-06, "loss": 0.8324182, "num_input_tokens_seen": 93710165, "step": 4339, "time_per_iteration": 2.7237682342529297 }, { "auxiliary_loss_clip": 0.01090328, "auxiliary_loss_mlp": 0.01040213, "balance_loss_clip": 1.04649866, "balance_loss_mlp": 1.02436984, "epoch": 0.26093491657898693, "flos": 24680937329280.0, "grad_norm": 1.6860698424881835, "language_loss": 0.76643449, "learning_rate": 3.4661542000260033e-06, "loss": 0.78773987, "num_input_tokens_seen": 93730185, "step": 4340, "time_per_iteration": 2.817647695541382 }, { "auxiliary_loss_clip": 0.01082903, "auxiliary_loss_mlp": 0.01040837, "balance_loss_clip": 1.04781985, "balance_loss_mlp": 1.02381396, "epoch": 0.2609950398316549, "flos": 25116138512640.0, "grad_norm": 1.954971477972507, "language_loss": 0.82689369, "learning_rate": 3.465889281600845e-06, "loss": 0.84813106, "num_input_tokens_seen": 93747690, "step": 4341, "time_per_iteration": 2.822387218475342 }, { "auxiliary_loss_clip": 0.01148407, "auxiliary_loss_mlp": 0.0104134, "balance_loss_clip": 1.0550344, "balance_loss_mlp": 1.02387536, "epoch": 0.26105516308432286, "flos": 28548588216960.0, "grad_norm": 2.3225619433460083, "language_loss": 0.76828772, "learning_rate": 3.4656243075890183e-06, "loss": 0.79018521, "num_input_tokens_seen": 93767405, "step": 4342, "time_per_iteration": 2.7091987133026123 }, { "auxiliary_loss_clip": 0.01137117, "auxiliary_loss_mlp": 0.01036127, "balance_loss_clip": 1.05262113, "balance_loss_mlp": 1.01837635, "epoch": 0.2611152863369908, "flos": 39530609447040.0, "grad_norm": 1.8380809165191976, "language_loss": 0.66072762, "learning_rate": 3.4653592780005707e-06, "loss": 0.68246007, "num_input_tokens_seen": 93789950, "step": 4343, "time_per_iteration": 2.7885191440582275 }, { "auxiliary_loss_clip": 0.01076135, "auxiliary_loss_mlp": 0.01045298, "balance_loss_clip": 1.04419374, "balance_loss_mlp": 1.02715397, "epoch": 0.2611754095896588, "flos": 13735329511680.0, "grad_norm": 1.9033089414913282, "language_loss": 0.73626471, "learning_rate": 3.465094192845553e-06, "loss": 0.75747907, "num_input_tokens_seen": 93807835, "step": 4344, "time_per_iteration": 2.7622575759887695 }, { "auxiliary_loss_clip": 0.01150726, "auxiliary_loss_mlp": 0.01042349, "balance_loss_clip": 1.05625904, "balance_loss_mlp": 1.02560019, "epoch": 0.26123553284232676, "flos": 21506649649920.0, "grad_norm": 2.7815673216786045, "language_loss": 0.86820161, "learning_rate": 3.4648290521340165e-06, "loss": 0.89013231, "num_input_tokens_seen": 93825670, "step": 4345, "time_per_iteration": 2.615021228790283 }, { "auxiliary_loss_clip": 0.01121997, "auxiliary_loss_mlp": 0.01036853, "balance_loss_clip": 1.05178094, "balance_loss_mlp": 1.02056956, "epoch": 0.2612956560949947, "flos": 21139786091520.0, "grad_norm": 1.9109970692142244, "language_loss": 0.76235008, "learning_rate": 3.464563855876015e-06, "loss": 0.78393853, "num_input_tokens_seen": 93844045, "step": 4346, "time_per_iteration": 2.660766363143921 }, { "auxiliary_loss_clip": 0.01140284, "auxiliary_loss_mlp": 0.01045855, "balance_loss_clip": 1.05571795, "balance_loss_mlp": 1.02870095, "epoch": 0.2613557793476627, "flos": 25119011600640.0, "grad_norm": 1.6628741865434964, "language_loss": 0.75995654, "learning_rate": 3.464298604081606e-06, "loss": 0.78181791, "num_input_tokens_seen": 93864380, "step": 4347, "time_per_iteration": 2.6985979080200195 }, { "auxiliary_loss_clip": 0.0110699, "auxiliary_loss_mlp": 0.01041742, "balance_loss_clip": 1.05063343, "balance_loss_mlp": 1.02501726, "epoch": 0.26141590260033065, "flos": 26067699659520.0, "grad_norm": 1.7474860409603998, "language_loss": 0.73196864, "learning_rate": 3.4640332967608476e-06, "loss": 0.75345594, "num_input_tokens_seen": 93885475, "step": 4348, "time_per_iteration": 2.7511887550354004 }, { "auxiliary_loss_clip": 0.01110529, "auxiliary_loss_mlp": 0.01045849, "balance_loss_clip": 1.05199265, "balance_loss_mlp": 1.0290519, "epoch": 0.2614760258529987, "flos": 25701518459520.0, "grad_norm": 2.6377025292028944, "language_loss": 0.91262084, "learning_rate": 3.463767933923799e-06, "loss": 0.93418467, "num_input_tokens_seen": 93905545, "step": 4349, "time_per_iteration": 2.720240354537964 }, { "auxiliary_loss_clip": 0.0113714, "auxiliary_loss_mlp": 0.01048228, "balance_loss_clip": 1.05569661, "balance_loss_mlp": 1.03184831, "epoch": 0.26153614910566664, "flos": 17457147181440.0, "grad_norm": 1.7232851278977876, "language_loss": 0.80046499, "learning_rate": 3.463502515580524e-06, "loss": 0.82231867, "num_input_tokens_seen": 93924185, "step": 4350, "time_per_iteration": 2.652054786682129 }, { "auxiliary_loss_clip": 0.0113538, "auxiliary_loss_mlp": 0.01049567, "balance_loss_clip": 1.05652642, "balance_loss_mlp": 1.03299654, "epoch": 0.2615962723583346, "flos": 17712831168000.0, "grad_norm": 10.816271600027287, "language_loss": 0.62736505, "learning_rate": 3.4632370417410866e-06, "loss": 0.64921451, "num_input_tokens_seen": 93942825, "step": 4351, "time_per_iteration": 2.6674954891204834 }, { "auxiliary_loss_clip": 0.01138265, "auxiliary_loss_mlp": 0.01048518, "balance_loss_clip": 1.05201697, "balance_loss_mlp": 1.03168559, "epoch": 0.26165639561100257, "flos": 23257725672960.0, "grad_norm": 1.9014393183165526, "language_loss": 0.84131002, "learning_rate": 3.462971512415555e-06, "loss": 0.86317784, "num_input_tokens_seen": 93962045, "step": 4352, "time_per_iteration": 2.8033063411712646 }, { "auxiliary_loss_clip": 0.01065372, "auxiliary_loss_mlp": 0.0102292, "balance_loss_clip": 1.04145527, "balance_loss_mlp": 1.02078664, "epoch": 0.26171651886367053, "flos": 66737970800640.0, "grad_norm": 0.8050815788583346, "language_loss": 0.70591724, "learning_rate": 3.462705927613996e-06, "loss": 0.7268002, "num_input_tokens_seen": 94021175, "step": 4353, "time_per_iteration": 3.101954936981201 }, { "auxiliary_loss_clip": 0.01115948, "auxiliary_loss_mlp": 0.01069336, "balance_loss_clip": 1.04858005, "balance_loss_mlp": 1.05013168, "epoch": 0.2617766421163385, "flos": 22349581090560.0, "grad_norm": 1.6494861832481549, "language_loss": 0.77562749, "learning_rate": 3.4624402873464816e-06, "loss": 0.79748034, "num_input_tokens_seen": 94043370, "step": 4354, "time_per_iteration": 2.772723436355591 }, { "auxiliary_loss_clip": 0.01089887, "auxiliary_loss_mlp": 0.01058882, "balance_loss_clip": 1.04805279, "balance_loss_mlp": 1.04082203, "epoch": 0.26183676536900646, "flos": 26067125041920.0, "grad_norm": 1.8339738923409379, "language_loss": 0.68351537, "learning_rate": 3.462174591623085e-06, "loss": 0.70500308, "num_input_tokens_seen": 94063510, "step": 4355, "time_per_iteration": 5.908639430999756 }, { "auxiliary_loss_clip": 0.01094509, "auxiliary_loss_mlp": 0.01039879, "balance_loss_clip": 1.0486095, "balance_loss_mlp": 1.02164054, "epoch": 0.26189688862167443, "flos": 20996466825600.0, "grad_norm": 1.9440617828376934, "language_loss": 0.67573452, "learning_rate": 3.4619088404538815e-06, "loss": 0.69707847, "num_input_tokens_seen": 94083865, "step": 4356, "time_per_iteration": 4.351539611816406 }, { "auxiliary_loss_clip": 0.01057297, "auxiliary_loss_mlp": 0.0100707, "balance_loss_clip": 1.03335488, "balance_loss_mlp": 1.00484037, "epoch": 0.2619570118743424, "flos": 65798261141760.0, "grad_norm": 0.6809064288126679, "language_loss": 0.53124392, "learning_rate": 3.4616430338489487e-06, "loss": 0.55188763, "num_input_tokens_seen": 94144095, "step": 4357, "time_per_iteration": 3.0896964073181152 }, { "auxiliary_loss_clip": 0.01139918, "auxiliary_loss_mlp": 0.0104768, "balance_loss_clip": 1.05365348, "balance_loss_mlp": 1.03106248, "epoch": 0.26201713512701036, "flos": 28766817296640.0, "grad_norm": 1.8814759411194193, "language_loss": 0.84233022, "learning_rate": 3.4613771718183654e-06, "loss": 0.86420614, "num_input_tokens_seen": 94163035, "step": 4358, "time_per_iteration": 2.723057746887207 }, { "auxiliary_loss_clip": 0.01127273, "auxiliary_loss_mlp": 0.01043309, "balance_loss_clip": 1.04886353, "balance_loss_mlp": 1.02411628, "epoch": 0.2620772583796783, "flos": 26432516142720.0, "grad_norm": 2.354545555797757, "language_loss": 0.67324048, "learning_rate": 3.4611112543722127e-06, "loss": 0.69494629, "num_input_tokens_seen": 94182520, "step": 4359, "time_per_iteration": 2.7128403186798096 }, { "auxiliary_loss_clip": 0.01118602, "auxiliary_loss_mlp": 0.01045018, "balance_loss_clip": 1.04637527, "balance_loss_mlp": 1.02880526, "epoch": 0.2621373816323463, "flos": 20156552127360.0, "grad_norm": 1.8862311303010293, "language_loss": 0.78726596, "learning_rate": 3.4608452815205757e-06, "loss": 0.80890214, "num_input_tokens_seen": 94201795, "step": 4360, "time_per_iteration": 4.41027569770813 }, { "auxiliary_loss_clip": 0.01119481, "auxiliary_loss_mlp": 0.01042435, "balance_loss_clip": 1.04831719, "balance_loss_mlp": 1.02640164, "epoch": 0.26219750488501425, "flos": 28621235473920.0, "grad_norm": 1.8399079957082187, "language_loss": 0.67980468, "learning_rate": 3.4605792532735387e-06, "loss": 0.70142382, "num_input_tokens_seen": 94222390, "step": 4361, "time_per_iteration": 2.7642054557800293 }, { "auxiliary_loss_clip": 0.01139509, "auxiliary_loss_mlp": 0.01055985, "balance_loss_clip": 1.05313993, "balance_loss_mlp": 1.03842545, "epoch": 0.2622576281376823, "flos": 15042549173760.0, "grad_norm": 2.1489496912575166, "language_loss": 0.84068632, "learning_rate": 3.46031316964119e-06, "loss": 0.86264122, "num_input_tokens_seen": 94239980, "step": 4362, "time_per_iteration": 2.6152050495147705 }, { "auxiliary_loss_clip": 0.01105407, "auxiliary_loss_mlp": 0.01046107, "balance_loss_clip": 1.04752779, "balance_loss_mlp": 1.02867842, "epoch": 0.26231775139035024, "flos": 26396174557440.0, "grad_norm": 2.0545933935481835, "language_loss": 0.65068752, "learning_rate": 3.4600470306336197e-06, "loss": 0.67220271, "num_input_tokens_seen": 94260715, "step": 4363, "time_per_iteration": 2.7297046184539795 }, { "auxiliary_loss_clip": 0.01040739, "auxiliary_loss_mlp": 0.01017272, "balance_loss_clip": 1.02776587, "balance_loss_mlp": 1.01506662, "epoch": 0.2623778746430182, "flos": 65408918647680.0, "grad_norm": 0.9195643121956573, "language_loss": 0.61104208, "learning_rate": 3.4597808362609194e-06, "loss": 0.6316222, "num_input_tokens_seen": 94321285, "step": 4364, "time_per_iteration": 3.3122286796569824 }, { "auxiliary_loss_clip": 0.01151556, "auxiliary_loss_mlp": 0.01050336, "balance_loss_clip": 1.0550462, "balance_loss_mlp": 1.03201365, "epoch": 0.26243799789568617, "flos": 12604215254400.0, "grad_norm": 2.6922753747731387, "language_loss": 0.7223357, "learning_rate": 3.459514586533184e-06, "loss": 0.74435461, "num_input_tokens_seen": 94335420, "step": 4365, "time_per_iteration": 2.588611364364624 }, { "auxiliary_loss_clip": 0.01123747, "auxiliary_loss_mlp": 0.00776591, "balance_loss_clip": 1.05296087, "balance_loss_mlp": 1.00093484, "epoch": 0.26249812114835414, "flos": 28623821253120.0, "grad_norm": 1.9684942716361389, "language_loss": 0.77178609, "learning_rate": 3.459248281460509e-06, "loss": 0.79078948, "num_input_tokens_seen": 94357440, "step": 4366, "time_per_iteration": 2.7489407062530518 }, { "auxiliary_loss_clip": 0.01149499, "auxiliary_loss_mlp": 0.0104305, "balance_loss_clip": 1.05433846, "balance_loss_mlp": 1.02652764, "epoch": 0.2625582444010221, "flos": 14465393441280.0, "grad_norm": 1.9587652436204308, "language_loss": 0.76205176, "learning_rate": 3.4589819210529927e-06, "loss": 0.78397727, "num_input_tokens_seen": 94375690, "step": 4367, "time_per_iteration": 2.63778018951416 }, { "auxiliary_loss_clip": 0.01136158, "auxiliary_loss_mlp": 0.01045138, "balance_loss_clip": 1.0523572, "balance_loss_mlp": 1.02903318, "epoch": 0.26261836765369007, "flos": 16613174246400.0, "grad_norm": 2.055472748506688, "language_loss": 0.69400585, "learning_rate": 3.458715505320736e-06, "loss": 0.71581888, "num_input_tokens_seen": 94393190, "step": 4368, "time_per_iteration": 2.6515018939971924 }, { "auxiliary_loss_clip": 0.01123905, "auxiliary_loss_mlp": 0.01045619, "balance_loss_clip": 1.05272579, "balance_loss_mlp": 1.02791643, "epoch": 0.26267849090635803, "flos": 20519932066560.0, "grad_norm": 1.8794244148025279, "language_loss": 0.79255176, "learning_rate": 3.458449034273841e-06, "loss": 0.81424701, "num_input_tokens_seen": 94410975, "step": 4369, "time_per_iteration": 2.717142343521118 }, { "auxiliary_loss_clip": 0.01119662, "auxiliary_loss_mlp": 0.01040752, "balance_loss_clip": 1.05190969, "balance_loss_mlp": 1.02344334, "epoch": 0.262738614159026, "flos": 21323936142720.0, "grad_norm": 4.796099217910503, "language_loss": 0.83591807, "learning_rate": 3.4581825079224133e-06, "loss": 0.85752219, "num_input_tokens_seen": 94429985, "step": 4370, "time_per_iteration": 2.742966890335083 }, { "auxiliary_loss_clip": 0.01137822, "auxiliary_loss_mlp": 0.01053822, "balance_loss_clip": 1.05178714, "balance_loss_mlp": 1.0345341, "epoch": 0.26279873741169396, "flos": 17603590930560.0, "grad_norm": 1.7275848609842401, "language_loss": 0.71854705, "learning_rate": 3.4579159262765575e-06, "loss": 0.7404635, "num_input_tokens_seen": 94448660, "step": 4371, "time_per_iteration": 2.691899538040161 }, { "auxiliary_loss_clip": 0.01062293, "auxiliary_loss_mlp": 0.01003561, "balance_loss_clip": 1.02797341, "balance_loss_mlp": 1.00147498, "epoch": 0.2628588606643619, "flos": 60949746587520.0, "grad_norm": 0.6802377941963699, "language_loss": 0.56387627, "learning_rate": 3.457649289346384e-06, "loss": 0.58453482, "num_input_tokens_seen": 94515630, "step": 4372, "time_per_iteration": 3.279158115386963 }, { "auxiliary_loss_clip": 0.01124406, "auxiliary_loss_mlp": 0.01038838, "balance_loss_clip": 1.05295706, "balance_loss_mlp": 1.02169585, "epoch": 0.2629189839170299, "flos": 27016315891200.0, "grad_norm": 1.9842369613103452, "language_loss": 0.77777553, "learning_rate": 3.4573825971420042e-06, "loss": 0.79940796, "num_input_tokens_seen": 94535385, "step": 4373, "time_per_iteration": 2.8367159366607666 }, { "auxiliary_loss_clip": 0.01104424, "auxiliary_loss_mlp": 0.01039426, "balance_loss_clip": 1.05070519, "balance_loss_mlp": 1.02314186, "epoch": 0.26297910716969786, "flos": 17019863009280.0, "grad_norm": 7.588420148526772, "language_loss": 0.71397603, "learning_rate": 3.4571158496735294e-06, "loss": 0.73541456, "num_input_tokens_seen": 94552650, "step": 4374, "time_per_iteration": 2.722332239151001 }, { "auxiliary_loss_clip": 0.0112606, "auxiliary_loss_mlp": 0.01045748, "balance_loss_clip": 1.05836225, "balance_loss_mlp": 1.02748489, "epoch": 0.2630392304223659, "flos": 24897370728960.0, "grad_norm": 1.8414201938467747, "language_loss": 0.81212163, "learning_rate": 3.4568490469510756e-06, "loss": 0.83383965, "num_input_tokens_seen": 94574075, "step": 4375, "time_per_iteration": 2.7654781341552734 }, { "auxiliary_loss_clip": 0.01118996, "auxiliary_loss_mlp": 0.01045139, "balance_loss_clip": 1.04959798, "balance_loss_mlp": 1.02901626, "epoch": 0.26309935367503384, "flos": 32854026067200.0, "grad_norm": 1.6461571134793078, "language_loss": 0.6613251, "learning_rate": 3.4565821889847603e-06, "loss": 0.68296647, "num_input_tokens_seen": 94594255, "step": 4376, "time_per_iteration": 2.778731107711792 }, { "auxiliary_loss_clip": 0.01096695, "auxiliary_loss_mlp": 0.0106417, "balance_loss_clip": 1.04752398, "balance_loss_mlp": 1.04587138, "epoch": 0.2631594769277018, "flos": 15887958652800.0, "grad_norm": 1.7628322447974545, "language_loss": 0.69351411, "learning_rate": 3.4563152757847026e-06, "loss": 0.71512282, "num_input_tokens_seen": 94611410, "step": 4377, "time_per_iteration": 2.7606706619262695 }, { "auxiliary_loss_clip": 0.01141095, "auxiliary_loss_mlp": 0.01043033, "balance_loss_clip": 1.0561285, "balance_loss_mlp": 1.02606952, "epoch": 0.2632196001803698, "flos": 50804943557760.0, "grad_norm": 2.1982489321824352, "language_loss": 0.79961169, "learning_rate": 3.4560483073610233e-06, "loss": 0.82145292, "num_input_tokens_seen": 94636575, "step": 4378, "time_per_iteration": 2.9000468254089355 }, { "auxiliary_loss_clip": 0.01127331, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.05713558, "balance_loss_mlp": 1.03063893, "epoch": 0.26327972343303774, "flos": 13733031041280.0, "grad_norm": 1.912468890890116, "language_loss": 0.76285684, "learning_rate": 3.455781283723846e-06, "loss": 0.78458679, "num_input_tokens_seen": 94654345, "step": 4379, "time_per_iteration": 2.6757192611694336 }, { "auxiliary_loss_clip": 0.01114814, "auxiliary_loss_mlp": 0.01043, "balance_loss_clip": 1.05360019, "balance_loss_mlp": 1.02465415, "epoch": 0.2633398466857057, "flos": 23769057732480.0, "grad_norm": 1.982346793660648, "language_loss": 0.77895945, "learning_rate": 3.4555142048832975e-06, "loss": 0.80053759, "num_input_tokens_seen": 94673985, "step": 4380, "time_per_iteration": 2.745392084121704 }, { "auxiliary_loss_clip": 0.01125918, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.04945278, "balance_loss_mlp": 1.02351093, "epoch": 0.26339996993837367, "flos": 27600223380480.0, "grad_norm": 2.2040025999375215, "language_loss": 0.64148676, "learning_rate": 3.4552470708495036e-06, "loss": 0.66315508, "num_input_tokens_seen": 94693145, "step": 4381, "time_per_iteration": 2.8020689487457275 }, { "auxiliary_loss_clip": 0.01136752, "auxiliary_loss_mlp": 0.01038794, "balance_loss_clip": 1.05113709, "balance_loss_mlp": 1.02225995, "epoch": 0.26346009319104163, "flos": 16946317912320.0, "grad_norm": 1.9675616702193486, "language_loss": 0.82470775, "learning_rate": 3.454979881632595e-06, "loss": 0.8464632, "num_input_tokens_seen": 94710185, "step": 4382, "time_per_iteration": 2.66001558303833 }, { "auxiliary_loss_clip": 0.01106019, "auxiliary_loss_mlp": 0.01045742, "balance_loss_clip": 1.04899645, "balance_loss_mlp": 1.02726483, "epoch": 0.2635202164437096, "flos": 37232218915200.0, "grad_norm": 4.511875880791621, "language_loss": 0.70333207, "learning_rate": 3.4547126372427035e-06, "loss": 0.7248497, "num_input_tokens_seen": 94730280, "step": 4383, "time_per_iteration": 2.851227045059204 }, { "auxiliary_loss_clip": 0.01136676, "auxiliary_loss_mlp": 0.01039697, "balance_loss_clip": 1.05237031, "balance_loss_mlp": 1.0239253, "epoch": 0.26358033969637756, "flos": 20996359084800.0, "grad_norm": 3.019496854013466, "language_loss": 0.69455528, "learning_rate": 3.4544453376899638e-06, "loss": 0.71631902, "num_input_tokens_seen": 94748560, "step": 4384, "time_per_iteration": 2.670023202896118 }, { "auxiliary_loss_clip": 0.01135763, "auxiliary_loss_mlp": 0.01039573, "balance_loss_clip": 1.05114567, "balance_loss_mlp": 1.02275276, "epoch": 0.26364046294904553, "flos": 27746092512000.0, "grad_norm": 2.2712502599605036, "language_loss": 0.70067525, "learning_rate": 3.45417798298451e-06, "loss": 0.72242868, "num_input_tokens_seen": 94767570, "step": 4385, "time_per_iteration": 2.7232449054718018 }, { "auxiliary_loss_clip": 0.01112529, "auxiliary_loss_mlp": 0.0104946, "balance_loss_clip": 1.04893148, "balance_loss_mlp": 1.03190076, "epoch": 0.2637005862017135, "flos": 22893088757760.0, "grad_norm": 1.8128608655109948, "language_loss": 0.85684925, "learning_rate": 3.453910573136482e-06, "loss": 0.87846911, "num_input_tokens_seen": 94784985, "step": 4386, "time_per_iteration": 2.727924108505249 }, { "auxiliary_loss_clip": 0.01126521, "auxiliary_loss_mlp": 0.01046433, "balance_loss_clip": 1.0510478, "balance_loss_mlp": 1.02955282, "epoch": 0.26376070945438146, "flos": 15048834053760.0, "grad_norm": 2.174412940978395, "language_loss": 0.7796396, "learning_rate": 3.4536431081560196e-06, "loss": 0.80136907, "num_input_tokens_seen": 94802545, "step": 4387, "time_per_iteration": 2.666287660598755 }, { "auxiliary_loss_clip": 0.01134058, "auxiliary_loss_mlp": 0.01041407, "balance_loss_clip": 1.05609179, "balance_loss_mlp": 1.02537298, "epoch": 0.2638208327070494, "flos": 21141833166720.0, "grad_norm": 2.003302761742054, "language_loss": 0.76126039, "learning_rate": 3.453375588053264e-06, "loss": 0.78301507, "num_input_tokens_seen": 94820730, "step": 4388, "time_per_iteration": 2.6321358680725098 }, { "auxiliary_loss_clip": 0.01148944, "auxiliary_loss_mlp": 0.01036978, "balance_loss_clip": 1.05455542, "balance_loss_mlp": 1.02002645, "epoch": 0.26388095595971744, "flos": 21725597001600.0, "grad_norm": 2.534815675842734, "language_loss": 0.86675179, "learning_rate": 3.4531080128383617e-06, "loss": 0.88861108, "num_input_tokens_seen": 94839175, "step": 4389, "time_per_iteration": 2.6122422218322754 }, { "auxiliary_loss_clip": 0.01048602, "auxiliary_loss_mlp": 0.01002085, "balance_loss_clip": 1.03000987, "balance_loss_mlp": 0.99961758, "epoch": 0.2639410792123854, "flos": 65515537192320.0, "grad_norm": 0.8388510572165676, "language_loss": 0.60285747, "learning_rate": 3.452840382521457e-06, "loss": 0.62336433, "num_input_tokens_seen": 94898865, "step": 4390, "time_per_iteration": 3.1867401599884033 }, { "auxiliary_loss_clip": 0.01128567, "auxiliary_loss_mlp": 0.01040305, "balance_loss_clip": 1.05022383, "balance_loss_mlp": 1.02319825, "epoch": 0.2640012024650534, "flos": 23948574929280.0, "grad_norm": 1.6144448841655068, "language_loss": 0.77730125, "learning_rate": 3.4525726971127e-06, "loss": 0.79899001, "num_input_tokens_seen": 94917490, "step": 4391, "time_per_iteration": 2.707310676574707 }, { "auxiliary_loss_clip": 0.01031384, "auxiliary_loss_mlp": 0.00755302, "balance_loss_clip": 1.02553821, "balance_loss_mlp": 1.00244236, "epoch": 0.26406132571772134, "flos": 56441163369600.0, "grad_norm": 0.8840896383522404, "language_loss": 0.58758044, "learning_rate": 3.45230495662224e-06, "loss": 0.60544735, "num_input_tokens_seen": 94969065, "step": 4392, "time_per_iteration": 3.211859941482544 }, { "auxiliary_loss_clip": 0.01136937, "auxiliary_loss_mlp": 0.0105019, "balance_loss_clip": 1.05295539, "balance_loss_mlp": 1.03322649, "epoch": 0.2641214489703893, "flos": 22090557139200.0, "grad_norm": 1.9286153229889427, "language_loss": 0.68954027, "learning_rate": 3.4520371610602306e-06, "loss": 0.71141154, "num_input_tokens_seen": 94988540, "step": 4393, "time_per_iteration": 2.6483278274536133 }, { "auxiliary_loss_clip": 0.01140079, "auxiliary_loss_mlp": 0.01041521, "balance_loss_clip": 1.05395103, "balance_loss_mlp": 1.02398562, "epoch": 0.26418157222305727, "flos": 16544764794240.0, "grad_norm": 2.0454829511435193, "language_loss": 0.84071863, "learning_rate": 3.4517693104368267e-06, "loss": 0.86253464, "num_input_tokens_seen": 95004810, "step": 4394, "time_per_iteration": 4.3396079540252686 }, { "auxiliary_loss_clip": 0.01124083, "auxiliary_loss_mlp": 0.01045374, "balance_loss_clip": 1.04999089, "balance_loss_mlp": 1.02661061, "epoch": 0.26424169547572524, "flos": 18002486442240.0, "grad_norm": 2.096391063208514, "language_loss": 0.70044839, "learning_rate": 3.4515014047621856e-06, "loss": 0.72214299, "num_input_tokens_seen": 95024085, "step": 4395, "time_per_iteration": 2.8730056285858154 }, { "auxiliary_loss_clip": 0.01110387, "auxiliary_loss_mlp": 0.01037389, "balance_loss_clip": 1.04736662, "balance_loss_mlp": 1.02071214, "epoch": 0.2643018187283932, "flos": 16983162288000.0, "grad_norm": 2.1761517020490606, "language_loss": 0.86876452, "learning_rate": 3.4512334440464655e-06, "loss": 0.89024228, "num_input_tokens_seen": 95042515, "step": 4396, "time_per_iteration": 4.384250640869141 }, { "auxiliary_loss_clip": 0.01010716, "auxiliary_loss_mlp": 0.01021406, "balance_loss_clip": 1.02197146, "balance_loss_mlp": 1.01856887, "epoch": 0.26436194198106117, "flos": 59664359416320.0, "grad_norm": 0.7957760850485174, "language_loss": 0.55022657, "learning_rate": 3.4509654282998277e-06, "loss": 0.57054776, "num_input_tokens_seen": 95094835, "step": 4397, "time_per_iteration": 3.0656893253326416 }, { "auxiliary_loss_clip": 0.01132938, "auxiliary_loss_mlp": 0.01050463, "balance_loss_clip": 1.0485754, "balance_loss_mlp": 1.03357744, "epoch": 0.26442206523372913, "flos": 32921322197760.0, "grad_norm": 1.9110208887501443, "language_loss": 0.77881467, "learning_rate": 3.450697357532435e-06, "loss": 0.80064869, "num_input_tokens_seen": 95113480, "step": 4398, "time_per_iteration": 2.740917444229126 }, { "auxiliary_loss_clip": 0.01139914, "auxiliary_loss_mlp": 0.01040709, "balance_loss_clip": 1.05469537, "balance_loss_mlp": 1.02347112, "epoch": 0.2644821884863971, "flos": 21031300039680.0, "grad_norm": 1.7657486248278176, "language_loss": 0.67534482, "learning_rate": 3.4504292317544534e-06, "loss": 0.69715106, "num_input_tokens_seen": 95132580, "step": 4399, "time_per_iteration": 4.305487871170044 }, { "auxiliary_loss_clip": 0.01097219, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.04840231, "balance_loss_mlp": 1.02503681, "epoch": 0.26454231173906506, "flos": 20776801201920.0, "grad_norm": 1.6309197312133479, "language_loss": 0.86614597, "learning_rate": 3.4501610509760504e-06, "loss": 0.88753855, "num_input_tokens_seen": 95152375, "step": 4400, "time_per_iteration": 2.695883274078369 }, { "auxiliary_loss_clip": 0.01119339, "auxiliary_loss_mlp": 0.01039987, "balance_loss_clip": 1.0483284, "balance_loss_mlp": 1.0226419, "epoch": 0.264602434991733, "flos": 16618669027200.0, "grad_norm": 3.1942141071602546, "language_loss": 0.76518428, "learning_rate": 3.4498928152073944e-06, "loss": 0.78677756, "num_input_tokens_seen": 95170265, "step": 4401, "time_per_iteration": 2.69415545463562 }, { "auxiliary_loss_clip": 0.01100665, "auxiliary_loss_mlp": 0.01046326, "balance_loss_clip": 1.04473615, "balance_loss_mlp": 1.02758598, "epoch": 0.26466255824440105, "flos": 19062677295360.0, "grad_norm": 2.336049134907364, "language_loss": 0.88363832, "learning_rate": 3.4496245244586577e-06, "loss": 0.90510821, "num_input_tokens_seen": 95188655, "step": 4402, "time_per_iteration": 2.7073450088500977 }, { "auxiliary_loss_clip": 0.01105803, "auxiliary_loss_mlp": 0.01040704, "balance_loss_clip": 1.04894042, "balance_loss_mlp": 1.02327585, "epoch": 0.264722681497069, "flos": 22638554006400.0, "grad_norm": 1.7301089969072252, "language_loss": 0.7811445, "learning_rate": 3.4493561787400137e-06, "loss": 0.80260956, "num_input_tokens_seen": 95209615, "step": 4403, "time_per_iteration": 2.7213027477264404 }, { "auxiliary_loss_clip": 0.01128649, "auxiliary_loss_mlp": 0.01038032, "balance_loss_clip": 1.04674816, "balance_loss_mlp": 1.02050877, "epoch": 0.264782804749737, "flos": 22492253911680.0, "grad_norm": 2.1369132533571604, "language_loss": 0.88594282, "learning_rate": 3.4490877780616387e-06, "loss": 0.90760964, "num_input_tokens_seen": 95227810, "step": 4404, "time_per_iteration": 2.6888909339904785 }, { "auxiliary_loss_clip": 0.01123789, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.04607344, "balance_loss_mlp": 1.02416539, "epoch": 0.26484292800240494, "flos": 16800269212800.0, "grad_norm": 1.7519644069859235, "language_loss": 0.76134694, "learning_rate": 3.448819322433709e-06, "loss": 0.78299075, "num_input_tokens_seen": 95245890, "step": 4405, "time_per_iteration": 2.7172482013702393 }, { "auxiliary_loss_clip": 0.01148976, "auxiliary_loss_mlp": 0.01040198, "balance_loss_clip": 1.05348206, "balance_loss_mlp": 1.02266204, "epoch": 0.2649030512550729, "flos": 20449583280000.0, "grad_norm": 1.711457274305917, "language_loss": 0.69873697, "learning_rate": 3.4485508118664066e-06, "loss": 0.72062874, "num_input_tokens_seen": 95264955, "step": 4406, "time_per_iteration": 2.584300994873047 }, { "auxiliary_loss_clip": 0.01121151, "auxiliary_loss_mlp": 0.01050453, "balance_loss_clip": 1.05182838, "balance_loss_mlp": 1.03432453, "epoch": 0.2649631745077409, "flos": 22416123035520.0, "grad_norm": 1.7200250795424956, "language_loss": 0.83956587, "learning_rate": 3.448282246369912e-06, "loss": 0.86128193, "num_input_tokens_seen": 95284245, "step": 4407, "time_per_iteration": 2.731316328048706 }, { "auxiliary_loss_clip": 0.01108599, "auxiliary_loss_mlp": 0.01031757, "balance_loss_clip": 1.04695201, "balance_loss_mlp": 1.01501989, "epoch": 0.26502329776040884, "flos": 35116110927360.0, "grad_norm": 1.8896460113896294, "language_loss": 0.7597363, "learning_rate": 3.4480136259544084e-06, "loss": 0.78113985, "num_input_tokens_seen": 95307125, "step": 4408, "time_per_iteration": 2.8600730895996094 }, { "auxiliary_loss_clip": 0.01091919, "auxiliary_loss_mlp": 0.01044721, "balance_loss_clip": 1.04267502, "balance_loss_mlp": 1.02679181, "epoch": 0.2650834210130768, "flos": 38687498438400.0, "grad_norm": 1.7769050714437231, "language_loss": 0.70612216, "learning_rate": 3.447744950630084e-06, "loss": 0.72748852, "num_input_tokens_seen": 95329150, "step": 4409, "time_per_iteration": 2.936380386352539 }, { "auxiliary_loss_clip": 0.01131548, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.04774857, "balance_loss_mlp": 1.02218497, "epoch": 0.26514354426574477, "flos": 24716847951360.0, "grad_norm": 1.7357795205395667, "language_loss": 0.7337513, "learning_rate": 3.4474762204071253e-06, "loss": 0.75546867, "num_input_tokens_seen": 95349880, "step": 4410, "time_per_iteration": 2.7315077781677246 }, { "auxiliary_loss_clip": 0.01141374, "auxiliary_loss_mlp": 0.0104966, "balance_loss_clip": 1.05183268, "balance_loss_mlp": 1.03216028, "epoch": 0.26520366751841273, "flos": 20340055733760.0, "grad_norm": 1.8886288474708937, "language_loss": 0.73828322, "learning_rate": 3.4472074352957244e-06, "loss": 0.76019359, "num_input_tokens_seen": 95368570, "step": 4411, "time_per_iteration": 2.641920566558838 }, { "auxiliary_loss_clip": 0.01099594, "auxiliary_loss_mlp": 0.01041576, "balance_loss_clip": 1.04986739, "balance_loss_mlp": 1.02431464, "epoch": 0.2652637907710807, "flos": 22343870828160.0, "grad_norm": 1.9943391034693418, "language_loss": 0.82447588, "learning_rate": 3.446938595306071e-06, "loss": 0.84588754, "num_input_tokens_seen": 95387065, "step": 4412, "time_per_iteration": 2.8344247341156006 }, { "auxiliary_loss_clip": 0.01135402, "auxiliary_loss_mlp": 0.01052016, "balance_loss_clip": 1.05143464, "balance_loss_mlp": 1.03544593, "epoch": 0.26532391402374866, "flos": 19354235990400.0, "grad_norm": 1.775443234311944, "language_loss": 0.7446382, "learning_rate": 3.4466697004483622e-06, "loss": 0.76651239, "num_input_tokens_seen": 95406345, "step": 4413, "time_per_iteration": 2.657975196838379 }, { "auxiliary_loss_clip": 0.01056582, "auxiliary_loss_mlp": 0.01008584, "balance_loss_clip": 1.03258443, "balance_loss_mlp": 1.00659275, "epoch": 0.26538403727641663, "flos": 44787611422080.0, "grad_norm": 0.873557285042922, "language_loss": 0.56965125, "learning_rate": 3.446400750732793e-06, "loss": 0.59030288, "num_input_tokens_seen": 95463595, "step": 4414, "time_per_iteration": 3.1158244609832764 }, { "auxiliary_loss_clip": 0.01107803, "auxiliary_loss_mlp": 0.01046612, "balance_loss_clip": 1.04481411, "balance_loss_mlp": 1.03048313, "epoch": 0.26544416052908465, "flos": 28182119708160.0, "grad_norm": 1.5786807831647507, "language_loss": 0.74238014, "learning_rate": 3.4461317461695625e-06, "loss": 0.76392424, "num_input_tokens_seen": 95484115, "step": 4415, "time_per_iteration": 2.7223031520843506 }, { "auxiliary_loss_clip": 0.01095743, "auxiliary_loss_mlp": 0.01044325, "balance_loss_clip": 1.04215193, "balance_loss_mlp": 1.02402353, "epoch": 0.2655042837817526, "flos": 17565274097280.0, "grad_norm": 2.5102345694159016, "language_loss": 0.86855936, "learning_rate": 3.4458626867688707e-06, "loss": 0.88996005, "num_input_tokens_seen": 95501435, "step": 4416, "time_per_iteration": 2.7001683712005615 }, { "auxiliary_loss_clip": 0.01141467, "auxiliary_loss_mlp": 0.01046153, "balance_loss_clip": 1.05359149, "balance_loss_mlp": 1.02761602, "epoch": 0.2655644070344206, "flos": 23404636298880.0, "grad_norm": 1.6343137061510633, "language_loss": 0.76870787, "learning_rate": 3.4455935725409217e-06, "loss": 0.79058409, "num_input_tokens_seen": 95520135, "step": 4417, "time_per_iteration": 2.662196397781372 }, { "auxiliary_loss_clip": 0.01119441, "auxiliary_loss_mlp": 0.01041503, "balance_loss_clip": 1.04989183, "balance_loss_mlp": 1.02242982, "epoch": 0.26562453028708854, "flos": 26468462678400.0, "grad_norm": 1.6334113226277946, "language_loss": 0.80320108, "learning_rate": 3.4453244034959196e-06, "loss": 0.82481045, "num_input_tokens_seen": 95541705, "step": 4418, "time_per_iteration": 2.7742624282836914 }, { "auxiliary_loss_clip": 0.0113892, "auxiliary_loss_mlp": 0.01045476, "balance_loss_clip": 1.05182683, "balance_loss_mlp": 1.02721274, "epoch": 0.2656846535397565, "flos": 19207576759680.0, "grad_norm": 2.164903581235647, "language_loss": 0.67788607, "learning_rate": 3.445055179644071e-06, "loss": 0.69972998, "num_input_tokens_seen": 95560300, "step": 4419, "time_per_iteration": 2.6437718868255615 }, { "auxiliary_loss_clip": 0.01149692, "auxiliary_loss_mlp": 0.01046258, "balance_loss_clip": 1.05360699, "balance_loss_mlp": 1.02711296, "epoch": 0.2657447767924245, "flos": 30551325903360.0, "grad_norm": 1.9366129468869788, "language_loss": 0.79625547, "learning_rate": 3.444785900995585e-06, "loss": 0.81821501, "num_input_tokens_seen": 95580150, "step": 4420, "time_per_iteration": 2.6594905853271484 }, { "auxiliary_loss_clip": 0.01126984, "auxiliary_loss_mlp": 0.01053725, "balance_loss_clip": 1.05294895, "balance_loss_mlp": 1.03368592, "epoch": 0.26580490004509244, "flos": 20922742160640.0, "grad_norm": 1.9122536358412747, "language_loss": 0.81690109, "learning_rate": 3.444516567560673e-06, "loss": 0.83870822, "num_input_tokens_seen": 95597570, "step": 4421, "time_per_iteration": 2.681410551071167 }, { "auxiliary_loss_clip": 0.0113176, "auxiliary_loss_mlp": 0.01046737, "balance_loss_clip": 1.05015123, "balance_loss_mlp": 1.02904677, "epoch": 0.2658650232977604, "flos": 43945682584320.0, "grad_norm": 1.6112293393448585, "language_loss": 0.65704989, "learning_rate": 3.444247179349548e-06, "loss": 0.6788348, "num_input_tokens_seen": 95619415, "step": 4422, "time_per_iteration": 2.8766117095947266 }, { "auxiliary_loss_clip": 0.01130944, "auxiliary_loss_mlp": 0.01047224, "balance_loss_clip": 1.04903376, "balance_loss_mlp": 1.03039181, "epoch": 0.26592514655042837, "flos": 29716439109120.0, "grad_norm": 2.1017056533749896, "language_loss": 0.74229872, "learning_rate": 3.4439777363724252e-06, "loss": 0.76408041, "num_input_tokens_seen": 95639155, "step": 4423, "time_per_iteration": 2.6983659267425537 }, { "auxiliary_loss_clip": 0.01130559, "auxiliary_loss_mlp": 0.01057709, "balance_loss_clip": 1.04790974, "balance_loss_mlp": 1.03822982, "epoch": 0.26598526980309634, "flos": 46677730014720.0, "grad_norm": 1.6865310965149165, "language_loss": 0.77855694, "learning_rate": 3.443708238639522e-06, "loss": 0.80043966, "num_input_tokens_seen": 95663320, "step": 4424, "time_per_iteration": 2.900214433670044 }, { "auxiliary_loss_clip": 0.01132339, "auxiliary_loss_mlp": 0.01049395, "balance_loss_clip": 1.04963291, "balance_loss_mlp": 1.03181148, "epoch": 0.2660453930557643, "flos": 11509442582400.0, "grad_norm": 2.0755220631041684, "language_loss": 0.78940654, "learning_rate": 3.4434386861610573e-06, "loss": 0.81122386, "num_input_tokens_seen": 95680260, "step": 4425, "time_per_iteration": 2.6266820430755615 }, { "auxiliary_loss_clip": 0.01123867, "auxiliary_loss_mlp": 0.01043959, "balance_loss_clip": 1.05143404, "balance_loss_mlp": 1.02767467, "epoch": 0.26610551630843227, "flos": 24791578197120.0, "grad_norm": 1.5673316066045293, "language_loss": 0.80135047, "learning_rate": 3.4431690789472532e-06, "loss": 0.82302874, "num_input_tokens_seen": 95701140, "step": 4426, "time_per_iteration": 2.7015280723571777 }, { "auxiliary_loss_clip": 0.01150747, "auxiliary_loss_mlp": 0.0104448, "balance_loss_clip": 1.0554285, "balance_loss_mlp": 1.02678883, "epoch": 0.26616563956110023, "flos": 27636385397760.0, "grad_norm": 1.617839398314704, "language_loss": 0.77174348, "learning_rate": 3.442899417008333e-06, "loss": 0.79369569, "num_input_tokens_seen": 95722060, "step": 4427, "time_per_iteration": 2.6438984870910645 }, { "auxiliary_loss_clip": 0.01112968, "auxiliary_loss_mlp": 0.01037518, "balance_loss_clip": 1.05125654, "balance_loss_mlp": 1.02069747, "epoch": 0.26622576281376825, "flos": 28362893880960.0, "grad_norm": 1.5634759975385293, "language_loss": 0.76754683, "learning_rate": 3.4426297003545227e-06, "loss": 0.78905165, "num_input_tokens_seen": 95742495, "step": 4428, "time_per_iteration": 2.7695741653442383 }, { "auxiliary_loss_clip": 0.01114899, "auxiliary_loss_mlp": 0.00775922, "balance_loss_clip": 1.04922283, "balance_loss_mlp": 1.0008111, "epoch": 0.2662858860664362, "flos": 18041341979520.0, "grad_norm": 1.815928660217762, "language_loss": 0.82900071, "learning_rate": 3.4423599289960495e-06, "loss": 0.84790885, "num_input_tokens_seen": 95761510, "step": 4429, "time_per_iteration": 2.764183282852173 }, { "auxiliary_loss_clip": 0.01106492, "auxiliary_loss_mlp": 0.01039033, "balance_loss_clip": 1.05041027, "balance_loss_mlp": 1.02201009, "epoch": 0.2663460093191042, "flos": 22745818995840.0, "grad_norm": 1.6463341595476202, "language_loss": 0.71996218, "learning_rate": 3.442090102943143e-06, "loss": 0.74141741, "num_input_tokens_seen": 95782385, "step": 4430, "time_per_iteration": 2.7244491577148438 }, { "auxiliary_loss_clip": 0.01148257, "auxiliary_loss_mlp": 0.01049268, "balance_loss_clip": 1.05231071, "balance_loss_mlp": 1.03068352, "epoch": 0.26640613257177215, "flos": 16508782344960.0, "grad_norm": 1.9574919733512919, "language_loss": 0.82021642, "learning_rate": 3.441820222206035e-06, "loss": 0.84219164, "num_input_tokens_seen": 95800595, "step": 4431, "time_per_iteration": 2.5910067558288574 }, { "auxiliary_loss_clip": 0.01143334, "auxiliary_loss_mlp": 0.01050031, "balance_loss_clip": 1.0540812, "balance_loss_mlp": 1.03141046, "epoch": 0.2664662558244401, "flos": 23075945919360.0, "grad_norm": 2.074794485495937, "language_loss": 0.76745522, "learning_rate": 3.44155028679496e-06, "loss": 0.7893889, "num_input_tokens_seen": 95818480, "step": 4432, "time_per_iteration": 2.6548166275024414 }, { "auxiliary_loss_clip": 0.01089372, "auxiliary_loss_mlp": 0.01052807, "balance_loss_clip": 1.04526138, "balance_loss_mlp": 1.03232694, "epoch": 0.2665263790771081, "flos": 23769273214080.0, "grad_norm": 1.872584196626497, "language_loss": 0.82903433, "learning_rate": 3.441280296720154e-06, "loss": 0.85045612, "num_input_tokens_seen": 95837205, "step": 4433, "time_per_iteration": 4.2740867137908936 }, { "auxiliary_loss_clip": 0.01142798, "auxiliary_loss_mlp": 0.01045231, "balance_loss_clip": 1.05565643, "balance_loss_mlp": 1.02671802, "epoch": 0.26658650232977604, "flos": 28001273708160.0, "grad_norm": 2.548777168378285, "language_loss": 0.76308644, "learning_rate": 3.441010251991854e-06, "loss": 0.78496677, "num_input_tokens_seen": 95858395, "step": 4434, "time_per_iteration": 4.203384160995483 }, { "auxiliary_loss_clip": 0.0114611, "auxiliary_loss_mlp": 0.01044925, "balance_loss_clip": 1.05197668, "balance_loss_mlp": 1.02772319, "epoch": 0.266646625582444, "flos": 22163635359360.0, "grad_norm": 2.3452347637055393, "language_loss": 0.82496321, "learning_rate": 3.440740152620301e-06, "loss": 0.84687358, "num_input_tokens_seen": 95877875, "step": 4435, "time_per_iteration": 4.102782964706421 }, { "auxiliary_loss_clip": 0.01104916, "auxiliary_loss_mlp": 0.01062101, "balance_loss_clip": 1.04567468, "balance_loss_mlp": 1.04245555, "epoch": 0.266706748835112, "flos": 27853537069440.0, "grad_norm": 1.994258420562806, "language_loss": 0.87634504, "learning_rate": 3.4404699986157376e-06, "loss": 0.89801526, "num_input_tokens_seen": 95895820, "step": 4436, "time_per_iteration": 2.8048155307769775 }, { "auxiliary_loss_clip": 0.01121439, "auxiliary_loss_mlp": 0.01047617, "balance_loss_clip": 1.04637265, "balance_loss_mlp": 1.03054643, "epoch": 0.26676687208777994, "flos": 25812123413760.0, "grad_norm": 1.4763923958478316, "language_loss": 0.787242, "learning_rate": 3.440199789988407e-06, "loss": 0.80893254, "num_input_tokens_seen": 95918025, "step": 4437, "time_per_iteration": 2.7382607460021973 }, { "auxiliary_loss_clip": 0.01093686, "auxiliary_loss_mlp": 0.01048829, "balance_loss_clip": 1.05000877, "balance_loss_mlp": 1.03117394, "epoch": 0.2668269953404479, "flos": 36064583504640.0, "grad_norm": 4.5178491997969115, "language_loss": 0.63910848, "learning_rate": 3.439929526748556e-06, "loss": 0.66053367, "num_input_tokens_seen": 95937725, "step": 4438, "time_per_iteration": 2.956014633178711 }, { "auxiliary_loss_clip": 0.01080658, "auxiliary_loss_mlp": 0.01047394, "balance_loss_clip": 1.0432179, "balance_loss_mlp": 1.02994168, "epoch": 0.26688711859311587, "flos": 26570987072640.0, "grad_norm": 1.84569516037299, "language_loss": 0.75897747, "learning_rate": 3.4396592089064334e-06, "loss": 0.78025794, "num_input_tokens_seen": 95956335, "step": 4439, "time_per_iteration": 4.428173065185547 }, { "auxiliary_loss_clip": 0.01089075, "auxiliary_loss_mlp": 0.01041089, "balance_loss_clip": 1.04845262, "balance_loss_mlp": 1.02181315, "epoch": 0.26694724184578383, "flos": 26761565658240.0, "grad_norm": 2.10654378697334, "language_loss": 0.7172367, "learning_rate": 3.4393888364722897e-06, "loss": 0.73853838, "num_input_tokens_seen": 95977135, "step": 4440, "time_per_iteration": 2.9196605682373047 }, { "auxiliary_loss_clip": 0.01124038, "auxiliary_loss_mlp": 0.01049644, "balance_loss_clip": 1.04784775, "balance_loss_mlp": 1.02931881, "epoch": 0.2670073650984518, "flos": 20959586536320.0, "grad_norm": 1.869180757677473, "language_loss": 0.66229129, "learning_rate": 3.439118409456376e-06, "loss": 0.68402815, "num_input_tokens_seen": 95995435, "step": 4441, "time_per_iteration": 2.666428804397583 }, { "auxiliary_loss_clip": 0.01137041, "auxiliary_loss_mlp": 0.01049045, "balance_loss_clip": 1.04973912, "balance_loss_mlp": 1.02953053, "epoch": 0.2670674883511198, "flos": 28366054277760.0, "grad_norm": 3.888081439634283, "language_loss": 0.76102316, "learning_rate": 3.4388479278689486e-06, "loss": 0.78288412, "num_input_tokens_seen": 96016340, "step": 4442, "time_per_iteration": 2.6413686275482178 }, { "auxiliary_loss_clip": 0.0100646, "auxiliary_loss_mlp": 0.0105848, "balance_loss_clip": 1.02694619, "balance_loss_mlp": 1.05538034, "epoch": 0.2671276116037878, "flos": 58971319430400.0, "grad_norm": 0.9410220376713593, "language_loss": 0.61210632, "learning_rate": 3.4385773917202637e-06, "loss": 0.63275576, "num_input_tokens_seen": 96071205, "step": 4443, "time_per_iteration": 3.2342116832733154 }, { "auxiliary_loss_clip": 0.01123665, "auxiliary_loss_mlp": 0.01039982, "balance_loss_clip": 1.05413401, "balance_loss_mlp": 1.02239847, "epoch": 0.26718773485645575, "flos": 43945072053120.0, "grad_norm": 1.5620381861600383, "language_loss": 0.76195556, "learning_rate": 3.4383068010205793e-06, "loss": 0.78359205, "num_input_tokens_seen": 96094240, "step": 4444, "time_per_iteration": 3.136178731918335 }, { "auxiliary_loss_clip": 0.01142711, "auxiliary_loss_mlp": 0.01040756, "balance_loss_clip": 1.05331576, "balance_loss_mlp": 1.0213964, "epoch": 0.2672478581091237, "flos": 25228323665280.0, "grad_norm": 1.6750833182703528, "language_loss": 0.80892444, "learning_rate": 3.438036155780158e-06, "loss": 0.83075905, "num_input_tokens_seen": 96114105, "step": 4445, "time_per_iteration": 2.660952091217041 }, { "auxiliary_loss_clip": 0.01124381, "auxiliary_loss_mlp": 0.01048514, "balance_loss_clip": 1.05190587, "balance_loss_mlp": 1.02901077, "epoch": 0.2673079813617917, "flos": 15268176455040.0, "grad_norm": 2.1125172985353533, "language_loss": 0.89060926, "learning_rate": 3.43776545600926e-06, "loss": 0.9123382, "num_input_tokens_seen": 96132140, "step": 4446, "time_per_iteration": 2.6609115600585938 }, { "auxiliary_loss_clip": 0.011447, "auxiliary_loss_mlp": 0.01053132, "balance_loss_clip": 1.05528426, "balance_loss_mlp": 1.03541803, "epoch": 0.26736810461445965, "flos": 25812733944960.0, "grad_norm": 2.4310086382368783, "language_loss": 0.67756736, "learning_rate": 3.437494701718153e-06, "loss": 0.69954574, "num_input_tokens_seen": 96152090, "step": 4447, "time_per_iteration": 2.6696949005126953 }, { "auxiliary_loss_clip": 0.01144309, "auxiliary_loss_mlp": 0.0104489, "balance_loss_clip": 1.05496442, "balance_loss_mlp": 1.02572155, "epoch": 0.2674282278671276, "flos": 24312709054080.0, "grad_norm": 1.9687667134305082, "language_loss": 0.830899, "learning_rate": 3.4372238929171026e-06, "loss": 0.85279107, "num_input_tokens_seen": 96170015, "step": 4448, "time_per_iteration": 2.639463424682617 }, { "auxiliary_loss_clip": 0.0111564, "auxiliary_loss_mlp": 0.01054364, "balance_loss_clip": 1.05101895, "balance_loss_mlp": 1.03557646, "epoch": 0.2674883511197956, "flos": 22815521337600.0, "grad_norm": 1.479052407292424, "language_loss": 0.84231561, "learning_rate": 3.436953029616378e-06, "loss": 0.8640157, "num_input_tokens_seen": 96188065, "step": 4449, "time_per_iteration": 2.812290906906128 }, { "auxiliary_loss_clip": 0.0113237, "auxiliary_loss_mlp": 0.01055905, "balance_loss_clip": 1.05103493, "balance_loss_mlp": 1.03552055, "epoch": 0.26754847437246354, "flos": 25370170473600.0, "grad_norm": 1.7379167843341312, "language_loss": 0.84231997, "learning_rate": 3.4366821118262506e-06, "loss": 0.86420268, "num_input_tokens_seen": 96205780, "step": 4450, "time_per_iteration": 2.7598626613616943 }, { "auxiliary_loss_clip": 0.01109743, "auxiliary_loss_mlp": 0.01057779, "balance_loss_clip": 1.04833305, "balance_loss_mlp": 1.04044628, "epoch": 0.2676085976251315, "flos": 20230420446720.0, "grad_norm": 8.035146429526597, "language_loss": 0.80842566, "learning_rate": 3.4364111395569937e-06, "loss": 0.83010095, "num_input_tokens_seen": 96224990, "step": 4451, "time_per_iteration": 2.7467129230499268 }, { "auxiliary_loss_clip": 0.01141732, "auxiliary_loss_mlp": 0.01055516, "balance_loss_clip": 1.0553689, "balance_loss_mlp": 1.0379324, "epoch": 0.26766872087779947, "flos": 28038225824640.0, "grad_norm": 1.6378235408468254, "language_loss": 0.86285019, "learning_rate": 3.436140112818882e-06, "loss": 0.88482267, "num_input_tokens_seen": 96245345, "step": 4452, "time_per_iteration": 2.7442660331726074 }, { "auxiliary_loss_clip": 0.01134475, "auxiliary_loss_mlp": 0.01047993, "balance_loss_clip": 1.05496478, "balance_loss_mlp": 1.02926481, "epoch": 0.26772884413046744, "flos": 18325179250560.0, "grad_norm": 2.119384740597093, "language_loss": 0.83521158, "learning_rate": 3.435869031622194e-06, "loss": 0.85703623, "num_input_tokens_seen": 96259000, "step": 4453, "time_per_iteration": 2.659623146057129 }, { "auxiliary_loss_clip": 0.01141347, "auxiliary_loss_mlp": 0.01063496, "balance_loss_clip": 1.05624223, "balance_loss_mlp": 1.04485118, "epoch": 0.2677889673831354, "flos": 22127509255680.0, "grad_norm": 1.8460317519144305, "language_loss": 0.79565918, "learning_rate": 3.435597895977208e-06, "loss": 0.8177076, "num_input_tokens_seen": 96277000, "step": 4454, "time_per_iteration": 2.6458942890167236 }, { "auxiliary_loss_clip": 0.01130641, "auxiliary_loss_mlp": 0.01056871, "balance_loss_clip": 1.05338597, "balance_loss_mlp": 1.03869116, "epoch": 0.2678490906358034, "flos": 23729699404800.0, "grad_norm": 1.5255880946203295, "language_loss": 0.7241919, "learning_rate": 3.435326705894206e-06, "loss": 0.74606699, "num_input_tokens_seen": 96297010, "step": 4455, "time_per_iteration": 2.7328429222106934 }, { "auxiliary_loss_clip": 0.01112613, "auxiliary_loss_mlp": 0.01052208, "balance_loss_clip": 1.04858243, "balance_loss_mlp": 1.03508949, "epoch": 0.2679092138884714, "flos": 21762872340480.0, "grad_norm": 1.5657028408886426, "language_loss": 0.74017322, "learning_rate": 3.435055461383471e-06, "loss": 0.76182139, "num_input_tokens_seen": 96315780, "step": 4456, "time_per_iteration": 2.700190544128418 }, { "auxiliary_loss_clip": 0.0114232, "auxiliary_loss_mlp": 0.01048809, "balance_loss_clip": 1.05394006, "balance_loss_mlp": 1.03033149, "epoch": 0.26796933714113935, "flos": 19861186590720.0, "grad_norm": 2.4373070589767774, "language_loss": 0.70647967, "learning_rate": 3.4347841624552896e-06, "loss": 0.72839093, "num_input_tokens_seen": 96333465, "step": 4457, "time_per_iteration": 2.6334941387176514 }, { "auxiliary_loss_clip": 0.01112923, "auxiliary_loss_mlp": 0.01063608, "balance_loss_clip": 1.05205595, "balance_loss_mlp": 1.04513049, "epoch": 0.2680294603938073, "flos": 20047886507520.0, "grad_norm": 1.8228045543818674, "language_loss": 0.7903617, "learning_rate": 3.4345128091199493e-06, "loss": 0.81212699, "num_input_tokens_seen": 96352005, "step": 4458, "time_per_iteration": 2.7377572059631348 }, { "auxiliary_loss_clip": 0.01030327, "auxiliary_loss_mlp": 0.01043883, "balance_loss_clip": 1.0366354, "balance_loss_mlp": 1.0414269, "epoch": 0.2680895836464753, "flos": 72113763052800.0, "grad_norm": 0.9600198584891941, "language_loss": 0.58691025, "learning_rate": 3.434241401387739e-06, "loss": 0.60765231, "num_input_tokens_seen": 96406265, "step": 4459, "time_per_iteration": 3.2385354042053223 }, { "auxiliary_loss_clip": 0.0108842, "auxiliary_loss_mlp": 0.01056025, "balance_loss_clip": 1.04306948, "balance_loss_mlp": 1.0379889, "epoch": 0.26814970689914325, "flos": 20449044576000.0, "grad_norm": 2.1196386888642382, "language_loss": 0.84988648, "learning_rate": 3.4339699392689507e-06, "loss": 0.87133086, "num_input_tokens_seen": 96425225, "step": 4460, "time_per_iteration": 2.767054319381714 }, { "auxiliary_loss_clip": 0.01134128, "auxiliary_loss_mlp": 0.01059054, "balance_loss_clip": 1.0525527, "balance_loss_mlp": 1.03916979, "epoch": 0.2682098301518112, "flos": 17566674727680.0, "grad_norm": 1.6839260392555548, "language_loss": 0.68334675, "learning_rate": 3.4336984227738796e-06, "loss": 0.70527858, "num_input_tokens_seen": 96443780, "step": 4461, "time_per_iteration": 2.7217342853546143 }, { "auxiliary_loss_clip": 0.0111525, "auxiliary_loss_mlp": 0.01054739, "balance_loss_clip": 1.05045152, "balance_loss_mlp": 1.03649962, "epoch": 0.2682699534044792, "flos": 18333259810560.0, "grad_norm": 1.7146103847032579, "language_loss": 0.67240328, "learning_rate": 3.43342685191282e-06, "loss": 0.69410318, "num_input_tokens_seen": 96464530, "step": 4462, "time_per_iteration": 2.730682134628296 }, { "auxiliary_loss_clip": 0.01116667, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.05230319, "balance_loss_mlp": 1.02710128, "epoch": 0.26833007665714714, "flos": 25301294144640.0, "grad_norm": 1.7796857642272712, "language_loss": 0.69503593, "learning_rate": 3.4331552266960705e-06, "loss": 0.71666932, "num_input_tokens_seen": 96483345, "step": 4463, "time_per_iteration": 2.738046407699585 }, { "auxiliary_loss_clip": 0.01118676, "auxiliary_loss_mlp": 0.01049589, "balance_loss_clip": 1.0492326, "balance_loss_mlp": 1.02862048, "epoch": 0.2683901999098151, "flos": 16099759198080.0, "grad_norm": 2.5866232358274277, "language_loss": 0.77943784, "learning_rate": 3.432883547133931e-06, "loss": 0.80112046, "num_input_tokens_seen": 96498305, "step": 4464, "time_per_iteration": 2.6794681549072266 }, { "auxiliary_loss_clip": 0.01133564, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.05244994, "balance_loss_mlp": 1.02410388, "epoch": 0.2684503231624831, "flos": 27308054154240.0, "grad_norm": 2.2986867036088285, "language_loss": 0.71375966, "learning_rate": 3.432611813236704e-06, "loss": 0.73552406, "num_input_tokens_seen": 96519740, "step": 4465, "time_per_iteration": 2.699575662612915 }, { "auxiliary_loss_clip": 0.01042347, "auxiliary_loss_mlp": 0.01001834, "balance_loss_clip": 1.02813911, "balance_loss_mlp": 0.9993788, "epoch": 0.26851044641515104, "flos": 71858007239040.0, "grad_norm": 0.7242654721351415, "language_loss": 0.53150702, "learning_rate": 3.4323400250146943e-06, "loss": 0.5519489, "num_input_tokens_seen": 96588870, "step": 4466, "time_per_iteration": 3.3984062671661377 }, { "auxiliary_loss_clip": 0.01118674, "auxiliary_loss_mlp": 0.0105552, "balance_loss_clip": 1.04732478, "balance_loss_mlp": 1.03381157, "epoch": 0.268570569667819, "flos": 18733771434240.0, "grad_norm": 2.1738333593055796, "language_loss": 0.74038142, "learning_rate": 3.4320681824782057e-06, "loss": 0.76212335, "num_input_tokens_seen": 96605100, "step": 4467, "time_per_iteration": 2.6631343364715576 }, { "auxiliary_loss_clip": 0.01126618, "auxiliary_loss_mlp": 0.00777879, "balance_loss_clip": 1.05088973, "balance_loss_mlp": 1.00093102, "epoch": 0.268630692920487, "flos": 18178376365440.0, "grad_norm": 3.586661477808892, "language_loss": 0.80481976, "learning_rate": 3.4317962856375493e-06, "loss": 0.82386476, "num_input_tokens_seen": 96621410, "step": 4468, "time_per_iteration": 2.64806866645813 }, { "auxiliary_loss_clip": 0.01059326, "auxiliary_loss_mlp": 0.01006331, "balance_loss_clip": 1.02527809, "balance_loss_mlp": 1.0036248, "epoch": 0.268690816173155, "flos": 68731768978560.0, "grad_norm": 0.8399316740346766, "language_loss": 0.59498715, "learning_rate": 3.4315243345030334e-06, "loss": 0.61564374, "num_input_tokens_seen": 96684810, "step": 4469, "time_per_iteration": 3.1989517211914062 }, { "auxiliary_loss_clip": 0.01156531, "auxiliary_loss_mlp": 0.01048741, "balance_loss_clip": 1.05689096, "balance_loss_mlp": 1.02854705, "epoch": 0.26875093942582295, "flos": 23293636295040.0, "grad_norm": 2.165956170420043, "language_loss": 0.82055074, "learning_rate": 3.431252329084972e-06, "loss": 0.84260345, "num_input_tokens_seen": 96701920, "step": 4470, "time_per_iteration": 2.6167352199554443 }, { "auxiliary_loss_clip": 0.01117064, "auxiliary_loss_mlp": 0.01054605, "balance_loss_clip": 1.04794455, "balance_loss_mlp": 1.03563929, "epoch": 0.2688110626784909, "flos": 21543458112000.0, "grad_norm": 1.6543166375172473, "language_loss": 0.82841349, "learning_rate": 3.4309802693936786e-06, "loss": 0.8501302, "num_input_tokens_seen": 96721260, "step": 4471, "time_per_iteration": 4.177881956100464 }, { "auxiliary_loss_clip": 0.01133274, "auxiliary_loss_mlp": 0.01045934, "balance_loss_clip": 1.05339766, "balance_loss_mlp": 1.02762365, "epoch": 0.2688711859311589, "flos": 28400600183040.0, "grad_norm": 2.017001756898941, "language_loss": 0.69309431, "learning_rate": 3.43070815543947e-06, "loss": 0.71488637, "num_input_tokens_seen": 96740385, "step": 4472, "time_per_iteration": 2.6611149311065674 }, { "auxiliary_loss_clip": 0.01150636, "auxiliary_loss_mlp": 0.01046679, "balance_loss_clip": 1.05448234, "balance_loss_mlp": 1.02882099, "epoch": 0.26893130918382685, "flos": 25994944661760.0, "grad_norm": 1.889152474147147, "language_loss": 0.67809618, "learning_rate": 3.4304359872326656e-06, "loss": 0.70006931, "num_input_tokens_seen": 96761860, "step": 4473, "time_per_iteration": 2.6570448875427246 }, { "auxiliary_loss_clip": 0.01123821, "auxiliary_loss_mlp": 0.01056077, "balance_loss_clip": 1.05778623, "balance_loss_mlp": 1.03800452, "epoch": 0.2689914324364948, "flos": 20339624770560.0, "grad_norm": 2.20378943201051, "language_loss": 0.82835853, "learning_rate": 3.4301637647835843e-06, "loss": 0.8501575, "num_input_tokens_seen": 96781890, "step": 4474, "time_per_iteration": 5.79376220703125 }, { "auxiliary_loss_clip": 0.01138349, "auxiliary_loss_mlp": 0.01055982, "balance_loss_clip": 1.05353034, "balance_loss_mlp": 1.03841054, "epoch": 0.2690515556891628, "flos": 19464553635840.0, "grad_norm": 2.404484364093812, "language_loss": 0.71004206, "learning_rate": 3.4298914881025494e-06, "loss": 0.73198539, "num_input_tokens_seen": 96800390, "step": 4475, "time_per_iteration": 2.5969674587249756 }, { "auxiliary_loss_clip": 0.01112288, "auxiliary_loss_mlp": 0.00776382, "balance_loss_clip": 1.05001771, "balance_loss_mlp": 1.00081563, "epoch": 0.26911167894183075, "flos": 18146631720960.0, "grad_norm": 1.8574153972172647, "language_loss": 0.73638999, "learning_rate": 3.4296191571998863e-06, "loss": 0.75527668, "num_input_tokens_seen": 96816685, "step": 4476, "time_per_iteration": 2.70358943939209 }, { "auxiliary_loss_clip": 0.01119256, "auxiliary_loss_mlp": 0.01043783, "balance_loss_clip": 1.05050373, "balance_loss_mlp": 1.02605665, "epoch": 0.2691718021944987, "flos": 19975131509760.0, "grad_norm": 1.5040704863343832, "language_loss": 0.80439913, "learning_rate": 3.429346772085922e-06, "loss": 0.82602954, "num_input_tokens_seen": 96836285, "step": 4477, "time_per_iteration": 4.313180208206177 }, { "auxiliary_loss_clip": 0.01097359, "auxiliary_loss_mlp": 0.0104976, "balance_loss_clip": 1.04965031, "balance_loss_mlp": 1.0309844, "epoch": 0.2692319254471667, "flos": 37447215770880.0, "grad_norm": 1.7971929656919947, "language_loss": 0.65181434, "learning_rate": 3.429074332770984e-06, "loss": 0.67328548, "num_input_tokens_seen": 96857745, "step": 4478, "time_per_iteration": 2.8882603645324707 }, { "auxiliary_loss_clip": 0.01130488, "auxiliary_loss_mlp": 0.01050401, "balance_loss_clip": 1.04841042, "balance_loss_mlp": 1.03163743, "epoch": 0.26929204869983464, "flos": 22127796564480.0, "grad_norm": 1.933707281531851, "language_loss": 0.80987537, "learning_rate": 3.4288018392654047e-06, "loss": 0.83168429, "num_input_tokens_seen": 96877295, "step": 4479, "time_per_iteration": 2.670370578765869 }, { "auxiliary_loss_clip": 0.01127626, "auxiliary_loss_mlp": 0.00776143, "balance_loss_clip": 1.05010593, "balance_loss_mlp": 1.0010041, "epoch": 0.2693521719525026, "flos": 19792813052160.0, "grad_norm": 16.364114673072947, "language_loss": 0.81205857, "learning_rate": 3.4285292915795166e-06, "loss": 0.83109629, "num_input_tokens_seen": 96896160, "step": 4480, "time_per_iteration": 2.687922954559326 }, { "auxiliary_loss_clip": 0.01098242, "auxiliary_loss_mlp": 0.01051142, "balance_loss_clip": 1.04720628, "balance_loss_mlp": 1.03243792, "epoch": 0.2694122952051706, "flos": 20994383836800.0, "grad_norm": 1.5167677573266813, "language_loss": 0.77982032, "learning_rate": 3.4282566897236543e-06, "loss": 0.80131412, "num_input_tokens_seen": 96915410, "step": 4481, "time_per_iteration": 2.783400058746338 }, { "auxiliary_loss_clip": 0.01138325, "auxiliary_loss_mlp": 0.01055373, "balance_loss_clip": 1.05098486, "balance_loss_mlp": 1.03693104, "epoch": 0.2694724184578386, "flos": 25849291011840.0, "grad_norm": 1.817845708033507, "language_loss": 0.74072635, "learning_rate": 3.4279840337081547e-06, "loss": 0.76266336, "num_input_tokens_seen": 96937865, "step": 4482, "time_per_iteration": 2.704923629760742 }, { "auxiliary_loss_clip": 0.01124372, "auxiliary_loss_mlp": 0.01046467, "balance_loss_clip": 1.05258846, "balance_loss_mlp": 1.02826333, "epoch": 0.26953254171050656, "flos": 21726961718400.0, "grad_norm": 2.016330221700464, "language_loss": 0.72562164, "learning_rate": 3.4277113235433584e-06, "loss": 0.74733007, "num_input_tokens_seen": 96957710, "step": 4483, "time_per_iteration": 2.697889804840088 }, { "auxiliary_loss_clip": 0.0113896, "auxiliary_loss_mlp": 0.01056121, "balance_loss_clip": 1.04867983, "balance_loss_mlp": 1.03658295, "epoch": 0.2695926649631745, "flos": 19682926369920.0, "grad_norm": 2.3663265895203356, "language_loss": 0.86904967, "learning_rate": 3.427438559239605e-06, "loss": 0.89100051, "num_input_tokens_seen": 96975890, "step": 4484, "time_per_iteration": 2.6893441677093506 }, { "auxiliary_loss_clip": 0.01139698, "auxiliary_loss_mlp": 0.01049025, "balance_loss_clip": 1.05224931, "balance_loss_mlp": 1.03148949, "epoch": 0.2696527882158425, "flos": 32886596724480.0, "grad_norm": 1.783447205979712, "language_loss": 0.6663093, "learning_rate": 3.427165740807239e-06, "loss": 0.68819648, "num_input_tokens_seen": 96998595, "step": 4485, "time_per_iteration": 2.795172929763794 }, { "auxiliary_loss_clip": 0.01112833, "auxiliary_loss_mlp": 0.01053324, "balance_loss_clip": 1.04507363, "balance_loss_mlp": 1.03475094, "epoch": 0.26971291146851045, "flos": 12124843320960.0, "grad_norm": 2.5437851063433743, "language_loss": 0.73155308, "learning_rate": 3.426892868256604e-06, "loss": 0.75321472, "num_input_tokens_seen": 97013715, "step": 4486, "time_per_iteration": 2.6854116916656494 }, { "auxiliary_loss_clip": 0.01156209, "auxiliary_loss_mlp": 0.01047906, "balance_loss_clip": 1.05688012, "balance_loss_mlp": 1.03062034, "epoch": 0.2697730347211784, "flos": 22634459856000.0, "grad_norm": 2.2389379935408456, "language_loss": 0.84326887, "learning_rate": 3.4266199415980495e-06, "loss": 0.86531007, "num_input_tokens_seen": 97031570, "step": 4487, "time_per_iteration": 2.6117801666259766 }, { "auxiliary_loss_clip": 0.01127332, "auxiliary_loss_mlp": 0.0105083, "balance_loss_clip": 1.05733204, "balance_loss_mlp": 1.03228104, "epoch": 0.2698331579738464, "flos": 23513050523520.0, "grad_norm": 2.345170862120161, "language_loss": 0.7189706, "learning_rate": 3.4263469608419234e-06, "loss": 0.74075222, "num_input_tokens_seen": 97049815, "step": 4488, "time_per_iteration": 2.7384660243988037 }, { "auxiliary_loss_clip": 0.01074601, "auxiliary_loss_mlp": 0.01061378, "balance_loss_clip": 1.0494225, "balance_loss_mlp": 1.04040885, "epoch": 0.26989328122651435, "flos": 24641040297600.0, "grad_norm": 1.6359957516545125, "language_loss": 0.83725536, "learning_rate": 3.426073925998578e-06, "loss": 0.85861516, "num_input_tokens_seen": 97067570, "step": 4489, "time_per_iteration": 2.9274613857269287 }, { "auxiliary_loss_clip": 0.01129648, "auxiliary_loss_mlp": 0.01061235, "balance_loss_clip": 1.05630314, "balance_loss_mlp": 1.04203057, "epoch": 0.2699534044791823, "flos": 10772555068800.0, "grad_norm": 2.6678463269995785, "language_loss": 0.90056908, "learning_rate": 3.4258008370783656e-06, "loss": 0.9224779, "num_input_tokens_seen": 97082180, "step": 4490, "time_per_iteration": 2.9096486568450928 }, { "auxiliary_loss_clip": 0.01075397, "auxiliary_loss_mlp": 0.01052666, "balance_loss_clip": 1.04493999, "balance_loss_mlp": 1.03319883, "epoch": 0.2700135277318503, "flos": 36171597098880.0, "grad_norm": 2.0876908666200573, "language_loss": 0.73380542, "learning_rate": 3.4255276940916434e-06, "loss": 0.75508606, "num_input_tokens_seen": 97103470, "step": 4491, "time_per_iteration": 2.9016802310943604 }, { "auxiliary_loss_clip": 0.01156852, "auxiliary_loss_mlp": 0.01052294, "balance_loss_clip": 1.05944943, "balance_loss_mlp": 1.03453195, "epoch": 0.27007365098451824, "flos": 17418614866560.0, "grad_norm": 2.7575700534068783, "language_loss": 0.74795783, "learning_rate": 3.4252544970487676e-06, "loss": 0.77004933, "num_input_tokens_seen": 97118100, "step": 4492, "time_per_iteration": 2.6685187816619873 }, { "auxiliary_loss_clip": 0.01130467, "auxiliary_loss_mlp": 0.01050253, "balance_loss_clip": 1.05300546, "balance_loss_mlp": 1.03205013, "epoch": 0.2701337742371862, "flos": 23185688947200.0, "grad_norm": 3.551039047250381, "language_loss": 0.89015245, "learning_rate": 3.4249812459600986e-06, "loss": 0.91195965, "num_input_tokens_seen": 97136765, "step": 4493, "time_per_iteration": 2.7044742107391357 }, { "auxiliary_loss_clip": 0.01142037, "auxiliary_loss_mlp": 0.0104825, "balance_loss_clip": 1.05408192, "balance_loss_mlp": 1.03079772, "epoch": 0.2701938974898542, "flos": 24389450461440.0, "grad_norm": 1.665337194117132, "language_loss": 0.71139705, "learning_rate": 3.424707940835998e-06, "loss": 0.73329991, "num_input_tokens_seen": 97157470, "step": 4494, "time_per_iteration": 2.6299519538879395 }, { "auxiliary_loss_clip": 0.01120214, "auxiliary_loss_mlp": 0.01045805, "balance_loss_clip": 1.05193532, "balance_loss_mlp": 1.02893662, "epoch": 0.2702540207425222, "flos": 26214322976640.0, "grad_norm": 2.4718809008283045, "language_loss": 0.8642354, "learning_rate": 3.42443458168683e-06, "loss": 0.88589561, "num_input_tokens_seen": 97176905, "step": 4495, "time_per_iteration": 2.627389907836914 }, { "auxiliary_loss_clip": 0.01151814, "auxiliary_loss_mlp": 0.0105053, "balance_loss_clip": 1.05591631, "balance_loss_mlp": 1.03308964, "epoch": 0.27031414399519016, "flos": 22926377687040.0, "grad_norm": 2.1521214825296844, "language_loss": 0.76781964, "learning_rate": 3.424161168522959e-06, "loss": 0.78984308, "num_input_tokens_seen": 97196380, "step": 4496, "time_per_iteration": 2.5360703468322754 }, { "auxiliary_loss_clip": 0.01064272, "auxiliary_loss_mlp": 0.01049575, "balance_loss_clip": 1.03151321, "balance_loss_mlp": 1.04716671, "epoch": 0.2703742672478581, "flos": 63019780404480.0, "grad_norm": 0.7153442156657138, "language_loss": 0.50134224, "learning_rate": 3.423887701354754e-06, "loss": 0.52248067, "num_input_tokens_seen": 97260100, "step": 4497, "time_per_iteration": 3.1133949756622314 }, { "auxiliary_loss_clip": 0.01106563, "auxiliary_loss_mlp": 0.01051954, "balance_loss_clip": 1.05492568, "balance_loss_mlp": 1.03482318, "epoch": 0.2704343905005261, "flos": 18840820942080.0, "grad_norm": 2.421164292554959, "language_loss": 0.72386497, "learning_rate": 3.4236141801925847e-06, "loss": 0.74545014, "num_input_tokens_seen": 97277935, "step": 4498, "time_per_iteration": 2.7409775257110596 }, { "auxiliary_loss_clip": 0.01038432, "auxiliary_loss_mlp": 0.01028244, "balance_loss_clip": 1.0322926, "balance_loss_mlp": 1.02582395, "epoch": 0.27049451375319405, "flos": 71233412618880.0, "grad_norm": 0.7537228186848703, "language_loss": 0.5917033, "learning_rate": 3.4233406050468237e-06, "loss": 0.61237001, "num_input_tokens_seen": 97338845, "step": 4499, "time_per_iteration": 3.2331602573394775 }, { "auxiliary_loss_clip": 0.01124574, "auxiliary_loss_mlp": 0.01044613, "balance_loss_clip": 1.05154204, "balance_loss_mlp": 1.02593243, "epoch": 0.270554637005862, "flos": 24278594112000.0, "grad_norm": 2.1159538878254756, "language_loss": 0.73629957, "learning_rate": 3.4230669759278438e-06, "loss": 0.75799143, "num_input_tokens_seen": 97356640, "step": 4500, "time_per_iteration": 2.7513487339019775 }, { "auxiliary_loss_clip": 0.01116688, "auxiliary_loss_mlp": 0.01047016, "balance_loss_clip": 1.04657793, "balance_loss_mlp": 1.02878881, "epoch": 0.27061476025853, "flos": 17632318832640.0, "grad_norm": 2.8997006330289925, "language_loss": 0.81041664, "learning_rate": 3.4227932928460215e-06, "loss": 0.83205366, "num_input_tokens_seen": 97372585, "step": 4501, "time_per_iteration": 2.703014850616455 }, { "auxiliary_loss_clip": 0.01104056, "auxiliary_loss_mlp": 0.01053779, "balance_loss_clip": 1.04828477, "balance_loss_mlp": 1.03331053, "epoch": 0.27067488351119795, "flos": 22710123855360.0, "grad_norm": 4.2139696132912565, "language_loss": 0.7261312, "learning_rate": 3.422519555811735e-06, "loss": 0.74770957, "num_input_tokens_seen": 97393315, "step": 4502, "time_per_iteration": 2.732167959213257 }, { "auxiliary_loss_clip": 0.01129704, "auxiliary_loss_mlp": 0.01047167, "balance_loss_clip": 1.04821455, "balance_loss_mlp": 1.0268774, "epoch": 0.2707350067638659, "flos": 41719616087040.0, "grad_norm": 1.748421457410976, "language_loss": 0.67973912, "learning_rate": 3.4222457648353642e-06, "loss": 0.70150787, "num_input_tokens_seen": 97417860, "step": 4503, "time_per_iteration": 2.7950186729431152 }, { "auxiliary_loss_clip": 0.01100008, "auxiliary_loss_mlp": 0.01051668, "balance_loss_clip": 1.04750037, "balance_loss_mlp": 1.03180754, "epoch": 0.2707951300165339, "flos": 20193037367040.0, "grad_norm": 1.847411158173202, "language_loss": 0.67971921, "learning_rate": 3.4219719199272918e-06, "loss": 0.70123595, "num_input_tokens_seen": 97436780, "step": 4504, "time_per_iteration": 2.7830374240875244 }, { "auxiliary_loss_clip": 0.01142201, "auxiliary_loss_mlp": 0.01052204, "balance_loss_clip": 1.05604792, "balance_loss_mlp": 1.03451371, "epoch": 0.27085525326920185, "flos": 21433966479360.0, "grad_norm": 1.4870002594081857, "language_loss": 0.75395846, "learning_rate": 3.421698021097902e-06, "loss": 0.77590245, "num_input_tokens_seen": 97456190, "step": 4505, "time_per_iteration": 2.6758666038513184 }, { "auxiliary_loss_clip": 0.01155407, "auxiliary_loss_mlp": 0.01064618, "balance_loss_clip": 1.05439496, "balance_loss_mlp": 1.04436409, "epoch": 0.2709153765218698, "flos": 17675232606720.0, "grad_norm": 2.0635482699578254, "language_loss": 0.73474276, "learning_rate": 3.42142406835758e-06, "loss": 0.75694299, "num_input_tokens_seen": 97474545, "step": 4506, "time_per_iteration": 2.652395009994507 }, { "auxiliary_loss_clip": 0.01130629, "auxiliary_loss_mlp": 0.01053462, "balance_loss_clip": 1.05147469, "balance_loss_mlp": 1.0338285, "epoch": 0.2709754997745378, "flos": 24456243801600.0, "grad_norm": 2.6352592870517144, "language_loss": 0.80730569, "learning_rate": 3.421150061716715e-06, "loss": 0.82914662, "num_input_tokens_seen": 97494520, "step": 4507, "time_per_iteration": 2.7858307361602783 }, { "auxiliary_loss_clip": 0.01041671, "auxiliary_loss_mlp": 0.010698, "balance_loss_clip": 1.0261147, "balance_loss_mlp": 1.0667243, "epoch": 0.2710356230272058, "flos": 65210798206080.0, "grad_norm": 0.7655673562950965, "language_loss": 0.5085085, "learning_rate": 3.420876001185698e-06, "loss": 0.52962321, "num_input_tokens_seen": 97552455, "step": 4508, "time_per_iteration": 3.144418716430664 }, { "auxiliary_loss_clip": 0.01072779, "auxiliary_loss_mlp": 0.01046589, "balance_loss_clip": 1.04359698, "balance_loss_mlp": 1.02843356, "epoch": 0.27109574627987376, "flos": 25484438615040.0, "grad_norm": 1.9710162430227722, "language_loss": 0.74710357, "learning_rate": 3.4206018867749197e-06, "loss": 0.76829731, "num_input_tokens_seen": 97572650, "step": 4509, "time_per_iteration": 2.8052053451538086 }, { "auxiliary_loss_clip": 0.01130819, "auxiliary_loss_mlp": 0.01042284, "balance_loss_clip": 1.05107474, "balance_loss_mlp": 1.0254159, "epoch": 0.2711558695325417, "flos": 19682782715520.0, "grad_norm": 2.0468089657674353, "language_loss": 0.70937192, "learning_rate": 3.4203277184947757e-06, "loss": 0.73110294, "num_input_tokens_seen": 97591150, "step": 4510, "time_per_iteration": 2.6244139671325684 }, { "auxiliary_loss_clip": 0.01135912, "auxiliary_loss_mlp": 0.0103914, "balance_loss_clip": 1.05330467, "balance_loss_mlp": 1.02156901, "epoch": 0.2712159927852097, "flos": 18587758648320.0, "grad_norm": 2.4701723872261256, "language_loss": 0.70409644, "learning_rate": 3.4200534963556627e-06, "loss": 0.72584701, "num_input_tokens_seen": 97607410, "step": 4511, "time_per_iteration": 4.112820863723755 }, { "auxiliary_loss_clip": 0.0112023, "auxiliary_loss_mlp": 0.01049105, "balance_loss_clip": 1.048491, "balance_loss_mlp": 1.03115225, "epoch": 0.27127611603787766, "flos": 25630235919360.0, "grad_norm": 6.028868725677894, "language_loss": 0.81324005, "learning_rate": 3.419779220367979e-06, "loss": 0.83493352, "num_input_tokens_seen": 97626870, "step": 4512, "time_per_iteration": 4.285844087600708 }, { "auxiliary_loss_clip": 0.01147816, "auxiliary_loss_mlp": 0.01038614, "balance_loss_clip": 1.05365086, "balance_loss_mlp": 1.02323616, "epoch": 0.2713362392905456, "flos": 23148952312320.0, "grad_norm": 2.7707983308205053, "language_loss": 0.80467856, "learning_rate": 3.419504890542124e-06, "loss": 0.82654285, "num_input_tokens_seen": 97646595, "step": 4513, "time_per_iteration": 4.415290117263794 }, { "auxiliary_loss_clip": 0.01119685, "auxiliary_loss_mlp": 0.01044412, "balance_loss_clip": 1.04594898, "balance_loss_mlp": 1.02709103, "epoch": 0.2713963625432136, "flos": 18366045949440.0, "grad_norm": 1.8005970142501413, "language_loss": 0.88150048, "learning_rate": 3.4192305068885026e-06, "loss": 0.90314144, "num_input_tokens_seen": 97665485, "step": 4514, "time_per_iteration": 2.691697835922241 }, { "auxiliary_loss_clip": 0.01129072, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.05358005, "balance_loss_mlp": 1.03337574, "epoch": 0.27145648579588155, "flos": 22491751121280.0, "grad_norm": 1.6419144417830658, "language_loss": 0.91461927, "learning_rate": 3.418956069417517e-06, "loss": 0.93642819, "num_input_tokens_seen": 97683800, "step": 4515, "time_per_iteration": 2.6709890365600586 }, { "auxiliary_loss_clip": 0.01100451, "auxiliary_loss_mlp": 0.01057835, "balance_loss_clip": 1.04920852, "balance_loss_mlp": 1.03761721, "epoch": 0.2715166090485495, "flos": 19239177749760.0, "grad_norm": 2.0250040358395944, "language_loss": 0.74093282, "learning_rate": 3.4186815781395756e-06, "loss": 0.76251566, "num_input_tokens_seen": 97700505, "step": 4516, "time_per_iteration": 2.7001607418060303 }, { "auxiliary_loss_clip": 0.01136738, "auxiliary_loss_mlp": 0.01052795, "balance_loss_clip": 1.05046439, "balance_loss_mlp": 1.03483033, "epoch": 0.2715767323012175, "flos": 17709598944000.0, "grad_norm": 2.811509606055916, "language_loss": 0.75989574, "learning_rate": 3.4184070330650866e-06, "loss": 0.78179109, "num_input_tokens_seen": 97717410, "step": 4517, "time_per_iteration": 4.207966089248657 }, { "auxiliary_loss_clip": 0.01097642, "auxiliary_loss_mlp": 0.01058771, "balance_loss_clip": 1.04378986, "balance_loss_mlp": 1.03962636, "epoch": 0.27163685555388545, "flos": 22382834106240.0, "grad_norm": 2.3161178488466097, "language_loss": 0.77046895, "learning_rate": 3.4181324342044607e-06, "loss": 0.79203308, "num_input_tokens_seen": 97734545, "step": 4518, "time_per_iteration": 2.754009246826172 }, { "auxiliary_loss_clip": 0.01118909, "auxiliary_loss_mlp": 0.01047823, "balance_loss_clip": 1.05136919, "balance_loss_mlp": 1.03077579, "epoch": 0.2716969788065534, "flos": 22346708002560.0, "grad_norm": 2.717268994046331, "language_loss": 0.68388188, "learning_rate": 3.41785778156811e-06, "loss": 0.70554924, "num_input_tokens_seen": 97754000, "step": 4519, "time_per_iteration": 2.7800872325897217 }, { "auxiliary_loss_clip": 0.01134075, "auxiliary_loss_mlp": 0.01053278, "balance_loss_clip": 1.05009973, "balance_loss_mlp": 1.03611171, "epoch": 0.2717571020592214, "flos": 25228467319680.0, "grad_norm": 2.367483937305651, "language_loss": 0.75572526, "learning_rate": 3.417583075166451e-06, "loss": 0.7775988, "num_input_tokens_seen": 97772080, "step": 4520, "time_per_iteration": 2.694591760635376 }, { "auxiliary_loss_clip": 0.01138275, "auxiliary_loss_mlp": 0.0106095, "balance_loss_clip": 1.05209494, "balance_loss_mlp": 1.04226971, "epoch": 0.2718172253118894, "flos": 20189769229440.0, "grad_norm": 3.3698654303080935, "language_loss": 0.76434267, "learning_rate": 3.4173083150099e-06, "loss": 0.78633487, "num_input_tokens_seen": 97789370, "step": 4521, "time_per_iteration": 2.675443649291992 }, { "auxiliary_loss_clip": 0.01117262, "auxiliary_loss_mlp": 0.0106414, "balance_loss_clip": 1.04636955, "balance_loss_mlp": 1.04578209, "epoch": 0.27187734856455736, "flos": 14319129260160.0, "grad_norm": 2.1933848209734936, "language_loss": 0.75041616, "learning_rate": 3.417033501108875e-06, "loss": 0.77223015, "num_input_tokens_seen": 97807385, "step": 4522, "time_per_iteration": 2.769519329071045 }, { "auxiliary_loss_clip": 0.01151707, "auxiliary_loss_mlp": 0.01045506, "balance_loss_clip": 1.05433989, "balance_loss_mlp": 1.02813768, "epoch": 0.27193747181722533, "flos": 21107682311040.0, "grad_norm": 1.9328965147806931, "language_loss": 0.73074079, "learning_rate": 3.416758633473798e-06, "loss": 0.75271285, "num_input_tokens_seen": 97827930, "step": 4523, "time_per_iteration": 2.6642134189605713 }, { "auxiliary_loss_clip": 0.01120278, "auxiliary_loss_mlp": 0.01048373, "balance_loss_clip": 1.05034256, "balance_loss_mlp": 1.03014588, "epoch": 0.2719975950698933, "flos": 19682782715520.0, "grad_norm": 1.3899676528871532, "language_loss": 0.74113363, "learning_rate": 3.4164837121150915e-06, "loss": 0.76282012, "num_input_tokens_seen": 97847440, "step": 4524, "time_per_iteration": 2.6365647315979004 }, { "auxiliary_loss_clip": 0.0115251, "auxiliary_loss_mlp": 0.01059779, "balance_loss_clip": 1.05642283, "balance_loss_mlp": 1.04233861, "epoch": 0.27205771832256126, "flos": 24754482426240.0, "grad_norm": 1.6567279945506783, "language_loss": 0.7639389, "learning_rate": 3.4162087370431803e-06, "loss": 0.78606176, "num_input_tokens_seen": 97867620, "step": 4525, "time_per_iteration": 2.7116904258728027 }, { "auxiliary_loss_clip": 0.01133976, "auxiliary_loss_mlp": 0.01063183, "balance_loss_clip": 1.05110538, "balance_loss_mlp": 1.0458858, "epoch": 0.2721178415752292, "flos": 21755581879680.0, "grad_norm": 1.8049087044415455, "language_loss": 0.81449121, "learning_rate": 3.4159337082684926e-06, "loss": 0.8364628, "num_input_tokens_seen": 97884345, "step": 4526, "time_per_iteration": 2.583151340484619 }, { "auxiliary_loss_clip": 0.01150721, "auxiliary_loss_mlp": 0.01050593, "balance_loss_clip": 1.05157495, "balance_loss_mlp": 1.03235435, "epoch": 0.2721779648278972, "flos": 12676826597760.0, "grad_norm": 2.689071598576449, "language_loss": 0.77230763, "learning_rate": 3.4156586258014566e-06, "loss": 0.79432082, "num_input_tokens_seen": 97901500, "step": 4527, "time_per_iteration": 2.6060924530029297 }, { "auxiliary_loss_clip": 0.01109469, "auxiliary_loss_mlp": 0.00777538, "balance_loss_clip": 1.04898691, "balance_loss_mlp": 1.00073338, "epoch": 0.27223808808056515, "flos": 16253206099200.0, "grad_norm": 2.5564103940467313, "language_loss": 0.8187297, "learning_rate": 3.415383489652503e-06, "loss": 0.83759975, "num_input_tokens_seen": 97917800, "step": 4528, "time_per_iteration": 2.697845458984375 }, { "auxiliary_loss_clip": 0.01116518, "auxiliary_loss_mlp": 0.01058829, "balance_loss_clip": 1.05005443, "balance_loss_mlp": 1.04094744, "epoch": 0.2722982113332331, "flos": 27745805203200.0, "grad_norm": 1.774189879269534, "language_loss": 0.77156031, "learning_rate": 3.4151082998320666e-06, "loss": 0.7933138, "num_input_tokens_seen": 97937225, "step": 4529, "time_per_iteration": 2.75425124168396 }, { "auxiliary_loss_clip": 0.01123493, "auxiliary_loss_mlp": 0.01053103, "balance_loss_clip": 1.0518961, "balance_loss_mlp": 1.03634179, "epoch": 0.2723583345859011, "flos": 21726243446400.0, "grad_norm": 2.104422440945624, "language_loss": 0.82359695, "learning_rate": 3.4148330563505805e-06, "loss": 0.84536296, "num_input_tokens_seen": 97956845, "step": 4530, "time_per_iteration": 2.6822023391723633 }, { "auxiliary_loss_clip": 0.01136812, "auxiliary_loss_mlp": 0.01047087, "balance_loss_clip": 1.05334496, "balance_loss_mlp": 1.02971828, "epoch": 0.27241845783856905, "flos": 17347260499200.0, "grad_norm": 2.321764638586046, "language_loss": 0.91554427, "learning_rate": 3.4145577592184838e-06, "loss": 0.93738323, "num_input_tokens_seen": 97972465, "step": 4531, "time_per_iteration": 2.6979331970214844 }, { "auxiliary_loss_clip": 0.01138188, "auxiliary_loss_mlp": 0.01046663, "balance_loss_clip": 1.05187678, "balance_loss_mlp": 1.02856672, "epoch": 0.272478581091237, "flos": 24754302858240.0, "grad_norm": 1.9110068503115385, "language_loss": 0.76398945, "learning_rate": 3.4142824084462155e-06, "loss": 0.78583801, "num_input_tokens_seen": 97990770, "step": 4532, "time_per_iteration": 2.6663877964019775 }, { "auxiliary_loss_clip": 0.01113354, "auxiliary_loss_mlp": 0.01040904, "balance_loss_clip": 1.05224109, "balance_loss_mlp": 1.02386856, "epoch": 0.272538704343905, "flos": 17890624512000.0, "grad_norm": 2.311201731752709, "language_loss": 0.88514459, "learning_rate": 3.4140070040442162e-06, "loss": 0.90668714, "num_input_tokens_seen": 98005775, "step": 4533, "time_per_iteration": 2.693161725997925 }, { "auxiliary_loss_clip": 0.01122748, "auxiliary_loss_mlp": 0.01040937, "balance_loss_clip": 1.05127299, "balance_loss_mlp": 1.02398562, "epoch": 0.272598827596573, "flos": 22932016122240.0, "grad_norm": 2.2174577403643245, "language_loss": 0.71288157, "learning_rate": 3.413731546022929e-06, "loss": 0.73451841, "num_input_tokens_seen": 98025750, "step": 4534, "time_per_iteration": 2.7371840476989746 }, { "auxiliary_loss_clip": 0.01121649, "auxiliary_loss_mlp": 0.01040323, "balance_loss_clip": 1.05089378, "balance_loss_mlp": 1.02177453, "epoch": 0.27265895084924097, "flos": 24238409771520.0, "grad_norm": 1.6997646677502514, "language_loss": 0.91605014, "learning_rate": 3.4134560343928005e-06, "loss": 0.93766987, "num_input_tokens_seen": 98044955, "step": 4535, "time_per_iteration": 2.72127103805542 }, { "auxiliary_loss_clip": 0.0113065, "auxiliary_loss_mlp": 0.01045251, "balance_loss_clip": 1.05495596, "balance_loss_mlp": 1.02739298, "epoch": 0.27271907410190893, "flos": 27013155494400.0, "grad_norm": 1.6448383128638457, "language_loss": 0.72919363, "learning_rate": 3.4131804691642778e-06, "loss": 0.7509526, "num_input_tokens_seen": 98065860, "step": 4536, "time_per_iteration": 2.778991460800171 }, { "auxiliary_loss_clip": 0.01137601, "auxiliary_loss_mlp": 0.01044231, "balance_loss_clip": 1.05134857, "balance_loss_mlp": 1.02601612, "epoch": 0.2727791973545769, "flos": 34452588942720.0, "grad_norm": 1.7760428855271044, "language_loss": 0.71682841, "learning_rate": 3.41290485034781e-06, "loss": 0.73864675, "num_input_tokens_seen": 98085450, "step": 4537, "time_per_iteration": 2.7746009826660156 }, { "auxiliary_loss_clip": 0.01119602, "auxiliary_loss_mlp": 0.01042982, "balance_loss_clip": 1.04899096, "balance_loss_mlp": 1.02455187, "epoch": 0.27283932060724486, "flos": 15041723160960.0, "grad_norm": 2.103574663853892, "language_loss": 0.77419543, "learning_rate": 3.4126291779538485e-06, "loss": 0.79582125, "num_input_tokens_seen": 98099115, "step": 4538, "time_per_iteration": 2.6432113647460938 }, { "auxiliary_loss_clip": 0.011333, "auxiliary_loss_mlp": 0.01044735, "balance_loss_clip": 1.05075216, "balance_loss_mlp": 1.02784324, "epoch": 0.2728994438599128, "flos": 21652411040640.0, "grad_norm": 1.824827492408775, "language_loss": 0.90160263, "learning_rate": 3.412353451992847e-06, "loss": 0.923383, "num_input_tokens_seen": 98118415, "step": 4539, "time_per_iteration": 2.620088815689087 }, { "auxiliary_loss_clip": 0.0112346, "auxiliary_loss_mlp": 0.01044264, "balance_loss_clip": 1.04970992, "balance_loss_mlp": 1.0250001, "epoch": 0.2729595671125808, "flos": 17488424949120.0, "grad_norm": 1.7778813807473632, "language_loss": 0.88033229, "learning_rate": 3.4120776724752607e-06, "loss": 0.90200949, "num_input_tokens_seen": 98136300, "step": 4540, "time_per_iteration": 2.7115092277526855 }, { "auxiliary_loss_clip": 0.01139055, "auxiliary_loss_mlp": 0.00775653, "balance_loss_clip": 1.0515871, "balance_loss_mlp": 1.00068974, "epoch": 0.27301969036524876, "flos": 19318145800320.0, "grad_norm": 3.2240434674097758, "language_loss": 0.82471287, "learning_rate": 3.4118018394115476e-06, "loss": 0.84385997, "num_input_tokens_seen": 98154580, "step": 4541, "time_per_iteration": 2.6112682819366455 }, { "auxiliary_loss_clip": 0.01123955, "auxiliary_loss_mlp": 0.01045117, "balance_loss_clip": 1.05166435, "balance_loss_mlp": 1.02798617, "epoch": 0.2730798136179167, "flos": 21065666376960.0, "grad_norm": 2.102491799578544, "language_loss": 0.79535306, "learning_rate": 3.4115259528121678e-06, "loss": 0.81704378, "num_input_tokens_seen": 98173115, "step": 4542, "time_per_iteration": 2.7202932834625244 }, { "auxiliary_loss_clip": 0.01130053, "auxiliary_loss_mlp": 0.0103993, "balance_loss_clip": 1.05406725, "balance_loss_mlp": 1.02263296, "epoch": 0.2731399368705847, "flos": 19171737964800.0, "grad_norm": 1.955696716620197, "language_loss": 0.89326978, "learning_rate": 3.411250012687582e-06, "loss": 0.91496956, "num_input_tokens_seen": 98190260, "step": 4543, "time_per_iteration": 2.6846654415130615 }, { "auxiliary_loss_clip": 0.01118776, "auxiliary_loss_mlp": 0.00776653, "balance_loss_clip": 1.04913735, "balance_loss_mlp": 1.00080073, "epoch": 0.27320006012325265, "flos": 18290130554880.0, "grad_norm": 2.4410785724718997, "language_loss": 0.64012986, "learning_rate": 3.410974019048255e-06, "loss": 0.65908414, "num_input_tokens_seen": 98207115, "step": 4544, "time_per_iteration": 2.6373775005340576 }, { "auxiliary_loss_clip": 0.01123945, "auxiliary_loss_mlp": 0.01044578, "balance_loss_clip": 1.05455351, "balance_loss_mlp": 1.02582633, "epoch": 0.2732601833759206, "flos": 34860929731200.0, "grad_norm": 3.5876362405970643, "language_loss": 0.69788039, "learning_rate": 3.410697971904651e-06, "loss": 0.71956557, "num_input_tokens_seen": 98230610, "step": 4545, "time_per_iteration": 2.7943291664123535 }, { "auxiliary_loss_clip": 0.0103839, "auxiliary_loss_mlp": 0.01023664, "balance_loss_clip": 1.02576709, "balance_loss_mlp": 1.02123213, "epoch": 0.2733203066285886, "flos": 53910824762880.0, "grad_norm": 0.7314456658795918, "language_loss": 0.61636353, "learning_rate": 3.4104218712672383e-06, "loss": 0.63698411, "num_input_tokens_seen": 98293585, "step": 4546, "time_per_iteration": 3.2244455814361572 }, { "auxiliary_loss_clip": 0.0105925, "auxiliary_loss_mlp": 0.01053726, "balance_loss_clip": 1.04915786, "balance_loss_mlp": 1.03472424, "epoch": 0.2733804298812566, "flos": 20660378244480.0, "grad_norm": 1.905103737754333, "language_loss": 0.6467241, "learning_rate": 3.410145717146488e-06, "loss": 0.66785389, "num_input_tokens_seen": 98311680, "step": 4547, "time_per_iteration": 2.7815287113189697 }, { "auxiliary_loss_clip": 0.01123347, "auxiliary_loss_mlp": 0.00774125, "balance_loss_clip": 1.05267262, "balance_loss_mlp": 1.00081313, "epoch": 0.27344055313392457, "flos": 25884339707520.0, "grad_norm": 1.90846373489731, "language_loss": 0.77248073, "learning_rate": 3.4098695095528694e-06, "loss": 0.79145551, "num_input_tokens_seen": 98330770, "step": 4548, "time_per_iteration": 2.8113017082214355 }, { "auxiliary_loss_clip": 0.01122557, "auxiliary_loss_mlp": 0.01050902, "balance_loss_clip": 1.05430245, "balance_loss_mlp": 1.03526139, "epoch": 0.27350067638659253, "flos": 22929753565440.0, "grad_norm": 1.9713428286290122, "language_loss": 0.82792878, "learning_rate": 3.4095932484968585e-06, "loss": 0.84966338, "num_input_tokens_seen": 98349860, "step": 4549, "time_per_iteration": 2.6938650608062744 }, { "auxiliary_loss_clip": 0.01135405, "auxiliary_loss_mlp": 0.01048728, "balance_loss_clip": 1.04898036, "balance_loss_mlp": 1.02902281, "epoch": 0.2735607996392605, "flos": 16574821499520.0, "grad_norm": 3.4543610040263655, "language_loss": 0.71193838, "learning_rate": 3.4093169339889305e-06, "loss": 0.73377967, "num_input_tokens_seen": 98367040, "step": 4550, "time_per_iteration": 2.638643503189087 }, { "auxiliary_loss_clip": 0.01107347, "auxiliary_loss_mlp": 0.01042242, "balance_loss_clip": 1.05066109, "balance_loss_mlp": 1.02569556, "epoch": 0.27362092289192846, "flos": 19645291895040.0, "grad_norm": 3.3050607953849576, "language_loss": 0.78899491, "learning_rate": 3.409040566039563e-06, "loss": 0.81049079, "num_input_tokens_seen": 98384010, "step": 4551, "time_per_iteration": 4.352613210678101 }, { "auxiliary_loss_clip": 0.01107945, "auxiliary_loss_mlp": 0.01052105, "balance_loss_clip": 1.04898548, "balance_loss_mlp": 1.03342533, "epoch": 0.27368104614459643, "flos": 17639142416640.0, "grad_norm": 2.480443972085862, "language_loss": 0.71220398, "learning_rate": 3.4087641446592362e-06, "loss": 0.73380452, "num_input_tokens_seen": 98399625, "step": 4552, "time_per_iteration": 4.194540739059448 }, { "auxiliary_loss_clip": 0.01123037, "auxiliary_loss_mlp": 0.01045225, "balance_loss_clip": 1.05144608, "balance_loss_mlp": 1.0275104, "epoch": 0.2737411693972644, "flos": 21580015178880.0, "grad_norm": 2.1026303213651967, "language_loss": 0.71636003, "learning_rate": 3.408487669858431e-06, "loss": 0.73804259, "num_input_tokens_seen": 98417310, "step": 4553, "time_per_iteration": 2.7323882579803467 }, { "auxiliary_loss_clip": 0.01134032, "auxiliary_loss_mlp": 0.01045217, "balance_loss_clip": 1.05039358, "balance_loss_mlp": 1.02658415, "epoch": 0.27380129264993236, "flos": 25484043565440.0, "grad_norm": 1.7325126580228065, "language_loss": 0.58917797, "learning_rate": 3.4082111416476337e-06, "loss": 0.6109705, "num_input_tokens_seen": 98438670, "step": 4554, "time_per_iteration": 2.7384533882141113 }, { "auxiliary_loss_clip": 0.01129927, "auxiliary_loss_mlp": 0.01042216, "balance_loss_clip": 1.05440903, "balance_loss_mlp": 1.02400088, "epoch": 0.2738614159026003, "flos": 18661196004480.0, "grad_norm": 1.7915916386168997, "language_loss": 0.73645991, "learning_rate": 3.4079345600373275e-06, "loss": 0.75818133, "num_input_tokens_seen": 98456060, "step": 4555, "time_per_iteration": 2.742417335510254 }, { "auxiliary_loss_clip": 0.01141373, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.0561738, "balance_loss_mlp": 1.02152658, "epoch": 0.2739215391552683, "flos": 23477139901440.0, "grad_norm": 2.8904145278515303, "language_loss": 0.77755523, "learning_rate": 3.407657925038002e-06, "loss": 0.79936051, "num_input_tokens_seen": 98473765, "step": 4556, "time_per_iteration": 4.419378280639648 }, { "auxiliary_loss_clip": 0.01150896, "auxiliary_loss_mlp": 0.01049261, "balance_loss_clip": 1.05645621, "balance_loss_mlp": 1.02959132, "epoch": 0.27398166240793626, "flos": 17128636369920.0, "grad_norm": 7.460972643049535, "language_loss": 0.82236463, "learning_rate": 3.4073812366601473e-06, "loss": 0.84436619, "num_input_tokens_seen": 98490590, "step": 4557, "time_per_iteration": 2.6087756156921387 }, { "auxiliary_loss_clip": 0.01089746, "auxiliary_loss_mlp": 0.01046447, "balance_loss_clip": 1.04229808, "balance_loss_mlp": 1.02811229, "epoch": 0.2740417856606042, "flos": 23404744039680.0, "grad_norm": 2.034332886344347, "language_loss": 0.7293033, "learning_rate": 3.4071044949142547e-06, "loss": 0.75066525, "num_input_tokens_seen": 98510590, "step": 4558, "time_per_iteration": 2.7908921241760254 }, { "auxiliary_loss_clip": 0.0112554, "auxiliary_loss_mlp": 0.01051481, "balance_loss_clip": 1.05215442, "balance_loss_mlp": 1.03334939, "epoch": 0.2741019089132722, "flos": 12780428400000.0, "grad_norm": 2.134307291688894, "language_loss": 0.67842996, "learning_rate": 3.406827699810819e-06, "loss": 0.70020014, "num_input_tokens_seen": 98527875, "step": 4559, "time_per_iteration": 2.7246246337890625 }, { "auxiliary_loss_clip": 0.01121642, "auxiliary_loss_mlp": 0.01055203, "balance_loss_clip": 1.04958165, "balance_loss_mlp": 1.03646374, "epoch": 0.27416203216594015, "flos": 20631542601600.0, "grad_norm": 2.095192605103166, "language_loss": 0.7249226, "learning_rate": 3.4065508513603353e-06, "loss": 0.74669105, "num_input_tokens_seen": 98547575, "step": 4560, "time_per_iteration": 2.634526252746582 }, { "auxiliary_loss_clip": 0.01131443, "auxiliary_loss_mlp": 0.01049928, "balance_loss_clip": 1.05592251, "balance_loss_mlp": 1.03115225, "epoch": 0.27422215541860817, "flos": 26541576812160.0, "grad_norm": 2.095026193088577, "language_loss": 0.81413525, "learning_rate": 3.406273949573303e-06, "loss": 0.83594894, "num_input_tokens_seen": 98566290, "step": 4561, "time_per_iteration": 2.711106538772583 }, { "auxiliary_loss_clip": 0.01156737, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.05919766, "balance_loss_mlp": 1.02688003, "epoch": 0.27428227867127614, "flos": 23331163029120.0, "grad_norm": 1.7066421621801435, "language_loss": 0.75436246, "learning_rate": 3.4059969944602214e-06, "loss": 0.77636886, "num_input_tokens_seen": 98586255, "step": 4562, "time_per_iteration": 2.699544668197632 }, { "auxiliary_loss_clip": 0.01155238, "auxiliary_loss_mlp": 0.01038722, "balance_loss_clip": 1.06035113, "balance_loss_mlp": 1.02138865, "epoch": 0.2743424019239441, "flos": 23035115134080.0, "grad_norm": 1.784616644228294, "language_loss": 0.74751598, "learning_rate": 3.4057199860315928e-06, "loss": 0.76945561, "num_input_tokens_seen": 98606030, "step": 4563, "time_per_iteration": 2.788313627243042 }, { "auxiliary_loss_clip": 0.01119321, "auxiliary_loss_mlp": 0.01048987, "balance_loss_clip": 1.04918432, "balance_loss_mlp": 1.02912664, "epoch": 0.27440252517661207, "flos": 21981101420160.0, "grad_norm": 1.7657560231579414, "language_loss": 0.63026172, "learning_rate": 3.4054429242979213e-06, "loss": 0.65194476, "num_input_tokens_seen": 98625225, "step": 4564, "time_per_iteration": 2.810922145843506 }, { "auxiliary_loss_clip": 0.01128901, "auxiliary_loss_mlp": 0.01046032, "balance_loss_clip": 1.05438292, "balance_loss_mlp": 1.02732766, "epoch": 0.27446264842928003, "flos": 40187451502080.0, "grad_norm": 1.9571814389681148, "language_loss": 0.78683448, "learning_rate": 3.4051658092697135e-06, "loss": 0.8085838, "num_input_tokens_seen": 98649470, "step": 4565, "time_per_iteration": 2.846803665161133 }, { "auxiliary_loss_clip": 0.01095875, "auxiliary_loss_mlp": 0.01050978, "balance_loss_clip": 1.04981828, "balance_loss_mlp": 1.03370428, "epoch": 0.274522771681948, "flos": 13479681438720.0, "grad_norm": 2.4708024317398003, "language_loss": 0.68715227, "learning_rate": 3.404888640957477e-06, "loss": 0.70862079, "num_input_tokens_seen": 98666915, "step": 4566, "time_per_iteration": 2.714352607727051 }, { "auxiliary_loss_clip": 0.01142259, "auxiliary_loss_mlp": 0.01049797, "balance_loss_clip": 1.05835438, "balance_loss_mlp": 1.03326273, "epoch": 0.27458289493461596, "flos": 28622133313920.0, "grad_norm": 2.1203833431876435, "language_loss": 0.60966527, "learning_rate": 3.404611419371723e-06, "loss": 0.63158584, "num_input_tokens_seen": 98688240, "step": 4567, "time_per_iteration": 2.71791934967041 }, { "auxiliary_loss_clip": 0.01135855, "auxiliary_loss_mlp": 0.01047435, "balance_loss_clip": 1.05527198, "balance_loss_mlp": 1.02756321, "epoch": 0.2746430181872839, "flos": 20119815492480.0, "grad_norm": 4.134990661591929, "language_loss": 0.82529241, "learning_rate": 3.4043341445229627e-06, "loss": 0.84712529, "num_input_tokens_seen": 98708245, "step": 4568, "time_per_iteration": 2.6779236793518066 }, { "auxiliary_loss_clip": 0.01141648, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.06012177, "balance_loss_mlp": 1.01916456, "epoch": 0.2747031414399519, "flos": 20193468330240.0, "grad_norm": 2.0524329167860254, "language_loss": 0.68425417, "learning_rate": 3.4040568164217117e-06, "loss": 0.70604521, "num_input_tokens_seen": 98724575, "step": 4569, "time_per_iteration": 2.6595280170440674 }, { "auxiliary_loss_clip": 0.0111585, "auxiliary_loss_mlp": 0.01047943, "balance_loss_clip": 1.04627442, "balance_loss_mlp": 1.02938235, "epoch": 0.27476326469261986, "flos": 13516346246400.0, "grad_norm": 2.9457223850766283, "language_loss": 0.70966327, "learning_rate": 3.4037794350784848e-06, "loss": 0.73130119, "num_input_tokens_seen": 98740700, "step": 4570, "time_per_iteration": 2.7404215335845947 }, { "auxiliary_loss_clip": 0.01035018, "auxiliary_loss_mlp": 0.01027544, "balance_loss_clip": 1.03062916, "balance_loss_mlp": 1.02521896, "epoch": 0.2748233879452878, "flos": 65937127121280.0, "grad_norm": 0.7294499123437721, "language_loss": 0.55835986, "learning_rate": 3.4035020005038014e-06, "loss": 0.57898545, "num_input_tokens_seen": 98803030, "step": 4571, "time_per_iteration": 3.369403123855591 }, { "auxiliary_loss_clip": 0.01096573, "auxiliary_loss_mlp": 0.0104917, "balance_loss_clip": 1.0493505, "balance_loss_mlp": 1.03134847, "epoch": 0.2748835111979558, "flos": 17384212615680.0, "grad_norm": 2.8212366896407772, "language_loss": 0.78388298, "learning_rate": 3.4032245127081812e-06, "loss": 0.80534041, "num_input_tokens_seen": 98820505, "step": 4572, "time_per_iteration": 2.835817813873291 }, { "auxiliary_loss_clip": 0.01145371, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.05474758, "balance_loss_mlp": 1.02365255, "epoch": 0.27494363445062375, "flos": 23587565287680.0, "grad_norm": 3.882915196153325, "language_loss": 0.8126958, "learning_rate": 3.402946971702147e-06, "loss": 0.83453798, "num_input_tokens_seen": 98842150, "step": 4573, "time_per_iteration": 2.709415912628174 }, { "auxiliary_loss_clip": 0.01135124, "auxiliary_loss_mlp": 0.01042886, "balance_loss_clip": 1.0529685, "balance_loss_mlp": 1.0252434, "epoch": 0.2750037577032918, "flos": 17164582905600.0, "grad_norm": 1.740498780022663, "language_loss": 0.79043669, "learning_rate": 3.402669377496223e-06, "loss": 0.81221676, "num_input_tokens_seen": 98861050, "step": 4574, "time_per_iteration": 2.651921272277832 }, { "auxiliary_loss_clip": 0.01104251, "auxiliary_loss_mlp": 0.01052183, "balance_loss_clip": 1.05164313, "balance_loss_mlp": 1.03518367, "epoch": 0.27506388095595974, "flos": 24491903028480.0, "grad_norm": 2.03666793953709, "language_loss": 0.74517256, "learning_rate": 3.402391730100936e-06, "loss": 0.76673687, "num_input_tokens_seen": 98879695, "step": 4575, "time_per_iteration": 2.7622992992401123 }, { "auxiliary_loss_clip": 0.01126178, "auxiliary_loss_mlp": 0.01042992, "balance_loss_clip": 1.05188203, "balance_loss_mlp": 1.02700627, "epoch": 0.2751240042086277, "flos": 38764706722560.0, "grad_norm": 2.5671977719319745, "language_loss": 0.71951419, "learning_rate": 3.402114029526814e-06, "loss": 0.74120593, "num_input_tokens_seen": 98902035, "step": 4576, "time_per_iteration": 2.85740065574646 }, { "auxiliary_loss_clip": 0.01102681, "auxiliary_loss_mlp": 0.00778132, "balance_loss_clip": 1.0506314, "balance_loss_mlp": 1.00075579, "epoch": 0.27518412746129567, "flos": 26907039740160.0, "grad_norm": 1.8050360629969575, "language_loss": 0.73217857, "learning_rate": 3.4018362757843866e-06, "loss": 0.7509867, "num_input_tokens_seen": 98921835, "step": 4577, "time_per_iteration": 2.9024770259857178 }, { "auxiliary_loss_clip": 0.01130618, "auxiliary_loss_mlp": 0.01043838, "balance_loss_clip": 1.05657601, "balance_loss_mlp": 1.02571797, "epoch": 0.27524425071396363, "flos": 24900531125760.0, "grad_norm": 1.7818656930434014, "language_loss": 0.76073247, "learning_rate": 3.401558468884188e-06, "loss": 0.78247702, "num_input_tokens_seen": 98939610, "step": 4578, "time_per_iteration": 2.7173874378204346 }, { "auxiliary_loss_clip": 0.01120877, "auxiliary_loss_mlp": 0.01047646, "balance_loss_clip": 1.05252147, "balance_loss_mlp": 1.02741659, "epoch": 0.2753043739666316, "flos": 26288047641600.0, "grad_norm": 2.6134371594901773, "language_loss": 0.66563278, "learning_rate": 3.4012806088367516e-06, "loss": 0.68731803, "num_input_tokens_seen": 98962250, "step": 4579, "time_per_iteration": 2.730104446411133 }, { "auxiliary_loss_clip": 0.01113502, "auxiliary_loss_mlp": 0.01058443, "balance_loss_clip": 1.04683816, "balance_loss_mlp": 1.03911948, "epoch": 0.27536449721929956, "flos": 24206772867840.0, "grad_norm": 1.8779975195253575, "language_loss": 0.80174518, "learning_rate": 3.4010026956526137e-06, "loss": 0.82346463, "num_input_tokens_seen": 98981845, "step": 4580, "time_per_iteration": 2.8395349979400635 }, { "auxiliary_loss_clip": 0.01141995, "auxiliary_loss_mlp": 0.01050029, "balance_loss_clip": 1.05684924, "balance_loss_mlp": 1.02942991, "epoch": 0.27542462047196753, "flos": 19537272720000.0, "grad_norm": 1.5301552660019138, "language_loss": 0.67242241, "learning_rate": 3.4007247293423137e-06, "loss": 0.69434267, "num_input_tokens_seen": 99001855, "step": 4581, "time_per_iteration": 2.788644552230835 }, { "auxiliary_loss_clip": 0.01132258, "auxiliary_loss_mlp": 0.0104746, "balance_loss_clip": 1.0560689, "balance_loss_mlp": 1.03050864, "epoch": 0.2754847437246355, "flos": 14319165173760.0, "grad_norm": 1.785645052077455, "language_loss": 0.77915615, "learning_rate": 3.400446709916392e-06, "loss": 0.80095327, "num_input_tokens_seen": 99019880, "step": 4582, "time_per_iteration": 2.730393409729004 }, { "auxiliary_loss_clip": 0.0110084, "auxiliary_loss_mlp": 0.01042256, "balance_loss_clip": 1.05119133, "balance_loss_mlp": 1.02575767, "epoch": 0.27554486697730346, "flos": 18838773866880.0, "grad_norm": 1.737971373642785, "language_loss": 0.84479475, "learning_rate": 3.4001686373853895e-06, "loss": 0.86622572, "num_input_tokens_seen": 99037570, "step": 4583, "time_per_iteration": 2.7274270057678223 }, { "auxiliary_loss_clip": 0.01139632, "auxiliary_loss_mlp": 0.01044098, "balance_loss_clip": 1.05364764, "balance_loss_mlp": 1.02693176, "epoch": 0.2756049902299714, "flos": 22382295402240.0, "grad_norm": 1.6883560409679848, "language_loss": 0.67007428, "learning_rate": 3.3998905117598528e-06, "loss": 0.69191158, "num_input_tokens_seen": 99056875, "step": 4584, "time_per_iteration": 2.643176794052124 }, { "auxiliary_loss_clip": 0.01080495, "auxiliary_loss_mlp": 0.01054092, "balance_loss_clip": 1.04106402, "balance_loss_mlp": 1.03475666, "epoch": 0.2756651134826394, "flos": 19573901614080.0, "grad_norm": 1.8352571769398758, "language_loss": 0.77349764, "learning_rate": 3.399612333050327e-06, "loss": 0.79484355, "num_input_tokens_seen": 99074685, "step": 4585, "time_per_iteration": 2.6824886798858643 }, { "auxiliary_loss_clip": 0.01142822, "auxiliary_loss_mlp": 0.00775816, "balance_loss_clip": 1.05703616, "balance_loss_mlp": 1.00084651, "epoch": 0.27572523673530736, "flos": 23586559706880.0, "grad_norm": 1.697985370469672, "language_loss": 0.7201665, "learning_rate": 3.399334101267362e-06, "loss": 0.73935288, "num_input_tokens_seen": 99095300, "step": 4586, "time_per_iteration": 2.672872304916382 }, { "auxiliary_loss_clip": 0.01125604, "auxiliary_loss_mlp": 0.01038583, "balance_loss_clip": 1.05329537, "balance_loss_mlp": 1.02184618, "epoch": 0.2757853599879754, "flos": 22820118278400.0, "grad_norm": 2.166019285475688, "language_loss": 0.80385983, "learning_rate": 3.3990558164215073e-06, "loss": 0.82550168, "num_input_tokens_seen": 99115965, "step": 4587, "time_per_iteration": 2.716212272644043 }, { "auxiliary_loss_clip": 0.01139286, "auxiliary_loss_mlp": 0.0104661, "balance_loss_clip": 1.05435753, "balance_loss_mlp": 1.02916992, "epoch": 0.27584548324064334, "flos": 18551704371840.0, "grad_norm": 3.416975868515595, "language_loss": 0.83000016, "learning_rate": 3.398777478523316e-06, "loss": 0.85185915, "num_input_tokens_seen": 99134265, "step": 4588, "time_per_iteration": 2.6104485988616943 }, { "auxiliary_loss_clip": 0.01109827, "auxiliary_loss_mlp": 0.01042868, "balance_loss_clip": 1.04756808, "balance_loss_mlp": 1.02567828, "epoch": 0.2759056064933113, "flos": 23769883745280.0, "grad_norm": 1.3306263403060763, "language_loss": 0.75309169, "learning_rate": 3.398499087583342e-06, "loss": 0.77461863, "num_input_tokens_seen": 99156185, "step": 4589, "time_per_iteration": 4.333514928817749 }, { "auxiliary_loss_clip": 0.01138237, "auxiliary_loss_mlp": 0.01046648, "balance_loss_clip": 1.0555464, "balance_loss_mlp": 1.02944636, "epoch": 0.27596572974597927, "flos": 24281898163200.0, "grad_norm": 1.9812216556422375, "language_loss": 0.8860873, "learning_rate": 3.398220643612143e-06, "loss": 0.90793616, "num_input_tokens_seen": 99176735, "step": 4590, "time_per_iteration": 4.256460428237915 }, { "auxiliary_loss_clip": 0.01132985, "auxiliary_loss_mlp": 0.01048634, "balance_loss_clip": 1.05280411, "balance_loss_mlp": 1.03025222, "epoch": 0.27602585299864724, "flos": 35040985632000.0, "grad_norm": 1.594737426944321, "language_loss": 0.71265185, "learning_rate": 3.397942146620277e-06, "loss": 0.7344681, "num_input_tokens_seen": 99199765, "step": 4591, "time_per_iteration": 2.8263018131256104 }, { "auxiliary_loss_clip": 0.01114882, "auxiliary_loss_mlp": 0.01048296, "balance_loss_clip": 1.05395412, "balance_loss_mlp": 1.0301044, "epoch": 0.2760859762513152, "flos": 24309405002880.0, "grad_norm": 3.793452037579163, "language_loss": 0.80017495, "learning_rate": 3.3976635966183046e-06, "loss": 0.82180673, "num_input_tokens_seen": 99218435, "step": 4592, "time_per_iteration": 4.289790153503418 }, { "auxiliary_loss_clip": 0.01051224, "auxiliary_loss_mlp": 0.00755885, "balance_loss_clip": 1.02655387, "balance_loss_mlp": 1.00253439, "epoch": 0.27614609950398317, "flos": 71260739890560.0, "grad_norm": 0.710408868807485, "language_loss": 0.61613023, "learning_rate": 3.3973849936167886e-06, "loss": 0.63420129, "num_input_tokens_seen": 99276200, "step": 4593, "time_per_iteration": 3.201831817626953 }, { "auxiliary_loss_clip": 0.01130969, "auxiliary_loss_mlp": 0.01042983, "balance_loss_clip": 1.05307889, "balance_loss_mlp": 1.02640104, "epoch": 0.27620622275665113, "flos": 29674854138240.0, "grad_norm": 1.9659750468178385, "language_loss": 0.778301, "learning_rate": 3.3971063376262937e-06, "loss": 0.80004054, "num_input_tokens_seen": 99297625, "step": 4594, "time_per_iteration": 2.7222111225128174 }, { "auxiliary_loss_clip": 0.0113791, "auxiliary_loss_mlp": 0.01038649, "balance_loss_clip": 1.05557215, "balance_loss_mlp": 1.02168524, "epoch": 0.2762663460093191, "flos": 15378063137280.0, "grad_norm": 1.5118783378909677, "language_loss": 0.91944981, "learning_rate": 3.3968276286573866e-06, "loss": 0.9412154, "num_input_tokens_seen": 99315790, "step": 4595, "time_per_iteration": 4.290736198425293 }, { "auxiliary_loss_clip": 0.01134891, "auxiliary_loss_mlp": 0.01052323, "balance_loss_clip": 1.05374146, "balance_loss_mlp": 1.03413117, "epoch": 0.27632646926198706, "flos": 20704082117760.0, "grad_norm": 1.7744098894398055, "language_loss": 0.69208467, "learning_rate": 3.3965488667206353e-06, "loss": 0.71395689, "num_input_tokens_seen": 99334615, "step": 4596, "time_per_iteration": 2.7178540229797363 }, { "auxiliary_loss_clip": 0.01125254, "auxiliary_loss_mlp": 0.01048102, "balance_loss_clip": 1.05075955, "balance_loss_mlp": 1.02977943, "epoch": 0.276386592514655, "flos": 32813374849920.0, "grad_norm": 1.7305541104386353, "language_loss": 0.63536781, "learning_rate": 3.3962700518266113e-06, "loss": 0.65710139, "num_input_tokens_seen": 99356685, "step": 4597, "time_per_iteration": 2.7713348865509033 }, { "auxiliary_loss_clip": 0.01150233, "auxiliary_loss_mlp": 0.01046127, "balance_loss_clip": 1.05762243, "balance_loss_mlp": 1.02949786, "epoch": 0.276446715767323, "flos": 18551704371840.0, "grad_norm": 2.077440653118394, "language_loss": 0.86298984, "learning_rate": 3.395991183985887e-06, "loss": 0.8849535, "num_input_tokens_seen": 99374810, "step": 4598, "time_per_iteration": 2.6077804565429688 }, { "auxiliary_loss_clip": 0.01151532, "auxiliary_loss_mlp": 0.01046218, "balance_loss_clip": 1.0559516, "balance_loss_mlp": 1.02790797, "epoch": 0.27650683901999096, "flos": 22819615488000.0, "grad_norm": 2.6195813063936493, "language_loss": 0.79957914, "learning_rate": 3.395712263209037e-06, "loss": 0.82155669, "num_input_tokens_seen": 99391290, "step": 4599, "time_per_iteration": 2.67372989654541 }, { "auxiliary_loss_clip": 0.01127397, "auxiliary_loss_mlp": 0.01049332, "balance_loss_clip": 1.04922533, "balance_loss_mlp": 1.03152239, "epoch": 0.276566962272659, "flos": 21361534704000.0, "grad_norm": 1.7492576371751551, "language_loss": 0.78788924, "learning_rate": 3.395433289506639e-06, "loss": 0.80965656, "num_input_tokens_seen": 99409120, "step": 4600, "time_per_iteration": 2.7197396755218506 }, { "auxiliary_loss_clip": 0.01119636, "auxiliary_loss_mlp": 0.01049981, "balance_loss_clip": 1.05458808, "balance_loss_mlp": 1.03226674, "epoch": 0.27662708552532694, "flos": 17710604524800.0, "grad_norm": 2.9827767838021906, "language_loss": 0.7372371, "learning_rate": 3.3951542628892694e-06, "loss": 0.75893331, "num_input_tokens_seen": 99426180, "step": 4601, "time_per_iteration": 2.7212698459625244 }, { "auxiliary_loss_clip": 0.01137986, "auxiliary_loss_mlp": 0.01053484, "balance_loss_clip": 1.05503917, "balance_loss_mlp": 1.03514934, "epoch": 0.2766872087779949, "flos": 21252725429760.0, "grad_norm": 1.7018676665174548, "language_loss": 0.80055201, "learning_rate": 3.3948751833675113e-06, "loss": 0.82246667, "num_input_tokens_seen": 99447720, "step": 4602, "time_per_iteration": 2.6929776668548584 }, { "auxiliary_loss_clip": 0.01131471, "auxiliary_loss_mlp": 0.01060998, "balance_loss_clip": 1.05209374, "balance_loss_mlp": 1.04194784, "epoch": 0.2767473320306629, "flos": 12931900053120.0, "grad_norm": 2.3561631161543986, "language_loss": 0.77018148, "learning_rate": 3.3945960509519455e-06, "loss": 0.79210615, "num_input_tokens_seen": 99464720, "step": 4603, "time_per_iteration": 2.7761597633361816 }, { "auxiliary_loss_clip": 0.01118804, "auxiliary_loss_mlp": 0.01044782, "balance_loss_clip": 1.05331254, "balance_loss_mlp": 1.02858686, "epoch": 0.27680745528333084, "flos": 15012851604480.0, "grad_norm": 1.686999686787164, "language_loss": 0.81469357, "learning_rate": 3.3943168656531585e-06, "loss": 0.83632934, "num_input_tokens_seen": 99482310, "step": 4604, "time_per_iteration": 2.6715614795684814 }, { "auxiliary_loss_clip": 0.01096642, "auxiliary_loss_mlp": 0.0104217, "balance_loss_clip": 1.04733086, "balance_loss_mlp": 1.02428889, "epoch": 0.2768675785359988, "flos": 22637835734400.0, "grad_norm": 1.8500484413544072, "language_loss": 0.7021662, "learning_rate": 3.3940376274817363e-06, "loss": 0.72355425, "num_input_tokens_seen": 99501255, "step": 4605, "time_per_iteration": 2.824810266494751 }, { "auxiliary_loss_clip": 0.01051326, "auxiliary_loss_mlp": 0.01005015, "balance_loss_clip": 1.02826095, "balance_loss_mlp": 1.00244009, "epoch": 0.27692770178866677, "flos": 66130542881280.0, "grad_norm": 0.7013581781305706, "language_loss": 0.57222801, "learning_rate": 3.3937583364482673e-06, "loss": 0.59279138, "num_input_tokens_seen": 99568925, "step": 4606, "time_per_iteration": 3.288269519805908 }, { "auxiliary_loss_clip": 0.01125032, "auxiliary_loss_mlp": 0.01050719, "balance_loss_clip": 1.05177283, "balance_loss_mlp": 1.03280139, "epoch": 0.27698782504133473, "flos": 26464979059200.0, "grad_norm": 1.9503980757161308, "language_loss": 0.69579148, "learning_rate": 3.3934789925633424e-06, "loss": 0.71754897, "num_input_tokens_seen": 99588455, "step": 4607, "time_per_iteration": 2.7865042686462402 }, { "auxiliary_loss_clip": 0.0113039, "auxiliary_loss_mlp": 0.01040949, "balance_loss_clip": 1.05402029, "balance_loss_mlp": 1.0242002, "epoch": 0.2770479482940027, "flos": 25884806584320.0, "grad_norm": 1.5552750364168406, "language_loss": 0.69727945, "learning_rate": 3.393199595837555e-06, "loss": 0.71899283, "num_input_tokens_seen": 99609355, "step": 4608, "time_per_iteration": 2.7139909267425537 }, { "auxiliary_loss_clip": 0.0109619, "auxiliary_loss_mlp": 0.01041619, "balance_loss_clip": 1.04789758, "balance_loss_mlp": 1.024894, "epoch": 0.27710807154667066, "flos": 22857249962880.0, "grad_norm": 1.922338327624115, "language_loss": 0.73170602, "learning_rate": 3.392920146281499e-06, "loss": 0.75308412, "num_input_tokens_seen": 99628780, "step": 4609, "time_per_iteration": 2.8674490451812744 }, { "auxiliary_loss_clip": 0.01105896, "auxiliary_loss_mlp": 0.01054215, "balance_loss_clip": 1.04444993, "balance_loss_mlp": 1.03615475, "epoch": 0.27716819479933863, "flos": 17711071401600.0, "grad_norm": 2.284482242639661, "language_loss": 0.84028268, "learning_rate": 3.3926406439057714e-06, "loss": 0.86188376, "num_input_tokens_seen": 99644545, "step": 4610, "time_per_iteration": 2.6861605644226074 }, { "auxiliary_loss_clip": 0.01074905, "auxiliary_loss_mlp": 0.00781444, "balance_loss_clip": 1.04093325, "balance_loss_mlp": 1.00102568, "epoch": 0.2772283180520066, "flos": 19646046080640.0, "grad_norm": 2.0943450829127044, "language_loss": 0.68915951, "learning_rate": 3.3923610887209705e-06, "loss": 0.70772296, "num_input_tokens_seen": 99663125, "step": 4611, "time_per_iteration": 2.799345016479492 }, { "auxiliary_loss_clip": 0.01144902, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.05466819, "balance_loss_mlp": 1.02591395, "epoch": 0.27728844130467456, "flos": 21032628842880.0, "grad_norm": 2.6988182686748785, "language_loss": 0.73646772, "learning_rate": 3.392081480737698e-06, "loss": 0.75834239, "num_input_tokens_seen": 99682645, "step": 4612, "time_per_iteration": 2.643157720565796 }, { "auxiliary_loss_clip": 0.01139286, "auxiliary_loss_mlp": 0.00775997, "balance_loss_clip": 1.05283117, "balance_loss_mlp": 1.00099993, "epoch": 0.2773485645573425, "flos": 18989204025600.0, "grad_norm": 2.0654093622255436, "language_loss": 0.66356897, "learning_rate": 3.3918018199665563e-06, "loss": 0.68272179, "num_input_tokens_seen": 99700520, "step": 4613, "time_per_iteration": 2.6685144901275635 }, { "auxiliary_loss_clip": 0.01096758, "auxiliary_loss_mlp": 0.01051618, "balance_loss_clip": 1.04526055, "balance_loss_mlp": 1.03354573, "epoch": 0.27740868781001055, "flos": 21468440557440.0, "grad_norm": 1.5160858700983233, "language_loss": 0.79385912, "learning_rate": 3.39152210641815e-06, "loss": 0.8153429, "num_input_tokens_seen": 99720355, "step": 4614, "time_per_iteration": 2.82061505317688 }, { "auxiliary_loss_clip": 0.01129896, "auxiliary_loss_mlp": 0.01047714, "balance_loss_clip": 1.04873419, "balance_loss_mlp": 1.02978539, "epoch": 0.2774688110626785, "flos": 19827825834240.0, "grad_norm": 2.763943164845673, "language_loss": 0.80632633, "learning_rate": 3.3912423401030865e-06, "loss": 0.82810241, "num_input_tokens_seen": 99736090, "step": 4615, "time_per_iteration": 2.607448101043701 }, { "auxiliary_loss_clip": 0.01114657, "auxiliary_loss_mlp": 0.01051705, "balance_loss_clip": 1.04532576, "balance_loss_mlp": 1.03447962, "epoch": 0.2775289343153465, "flos": 18216226321920.0, "grad_norm": 2.3373471978129543, "language_loss": 0.646945, "learning_rate": 3.3909625210319735e-06, "loss": 0.66860855, "num_input_tokens_seen": 99751805, "step": 4616, "time_per_iteration": 2.693556308746338 }, { "auxiliary_loss_clip": 0.01133374, "auxiliary_loss_mlp": 0.01047225, "balance_loss_clip": 1.0536505, "balance_loss_mlp": 1.03001153, "epoch": 0.27758905756801444, "flos": 16472476673280.0, "grad_norm": 2.175848824107301, "language_loss": 0.82324976, "learning_rate": 3.3906826492154226e-06, "loss": 0.84505582, "num_input_tokens_seen": 99770610, "step": 4617, "time_per_iteration": 2.64677357673645 }, { "auxiliary_loss_clip": 0.01147475, "auxiliary_loss_mlp": 0.01049438, "balance_loss_clip": 1.05210304, "balance_loss_mlp": 1.03261721, "epoch": 0.2776491808206824, "flos": 18728240739840.0, "grad_norm": 2.8579401527932236, "language_loss": 0.77031851, "learning_rate": 3.3904027246640458e-06, "loss": 0.79228759, "num_input_tokens_seen": 99787305, "step": 4618, "time_per_iteration": 2.555001735687256 }, { "auxiliary_loss_clip": 0.01151182, "auxiliary_loss_mlp": 0.01042958, "balance_loss_clip": 1.05599475, "balance_loss_mlp": 1.0268048, "epoch": 0.27770930407335037, "flos": 28038189911040.0, "grad_norm": 1.6850470881083441, "language_loss": 0.85102153, "learning_rate": 3.390122747388459e-06, "loss": 0.87296283, "num_input_tokens_seen": 99808940, "step": 4619, "time_per_iteration": 2.753230094909668 }, { "auxiliary_loss_clip": 0.01121872, "auxiliary_loss_mlp": 0.01041506, "balance_loss_clip": 1.05075216, "balance_loss_mlp": 1.02592564, "epoch": 0.27776942732601834, "flos": 23549823072000.0, "grad_norm": 1.6763124645732197, "language_loss": 0.7707957, "learning_rate": 3.3898427173992778e-06, "loss": 0.79242951, "num_input_tokens_seen": 99829575, "step": 4620, "time_per_iteration": 2.7764816284179688 }, { "auxiliary_loss_clip": 0.01091863, "auxiliary_loss_mlp": 0.01042513, "balance_loss_clip": 1.04290819, "balance_loss_mlp": 1.02517962, "epoch": 0.2778295505786863, "flos": 23908713811200.0, "grad_norm": 1.985202794634515, "language_loss": 0.78144193, "learning_rate": 3.389562634707122e-06, "loss": 0.80278563, "num_input_tokens_seen": 99847575, "step": 4621, "time_per_iteration": 2.740419387817383 }, { "auxiliary_loss_clip": 0.01113871, "auxiliary_loss_mlp": 0.01054223, "balance_loss_clip": 1.04857588, "balance_loss_mlp": 1.03642535, "epoch": 0.27788967383135427, "flos": 25554571920000.0, "grad_norm": 2.864120631038579, "language_loss": 0.87357259, "learning_rate": 3.389282499322611e-06, "loss": 0.89525354, "num_input_tokens_seen": 99864995, "step": 4622, "time_per_iteration": 2.8351151943206787 }, { "auxiliary_loss_clip": 0.01096216, "auxiliary_loss_mlp": 0.01052098, "balance_loss_clip": 1.0477345, "balance_loss_mlp": 1.0349195, "epoch": 0.27794979708402223, "flos": 16252631481600.0, "grad_norm": 1.7857472181098575, "language_loss": 0.81315404, "learning_rate": 3.389002311256369e-06, "loss": 0.83463717, "num_input_tokens_seen": 99881540, "step": 4623, "time_per_iteration": 2.7112133502960205 }, { "auxiliary_loss_clip": 0.01119674, "auxiliary_loss_mlp": 0.01043259, "balance_loss_clip": 1.05434608, "balance_loss_mlp": 1.02628374, "epoch": 0.2780099203366902, "flos": 20667632791680.0, "grad_norm": 2.1551340516102897, "language_loss": 0.80889726, "learning_rate": 3.3887220705190204e-06, "loss": 0.83052659, "num_input_tokens_seen": 99899595, "step": 4624, "time_per_iteration": 2.6492481231689453 }, { "auxiliary_loss_clip": 0.01112812, "auxiliary_loss_mlp": 0.0077763, "balance_loss_clip": 1.05008531, "balance_loss_mlp": 1.00092447, "epoch": 0.27807004358935816, "flos": 17739583822080.0, "grad_norm": 2.21671742511245, "language_loss": 0.76949263, "learning_rate": 3.388441777121191e-06, "loss": 0.78839707, "num_input_tokens_seen": 99913020, "step": 4625, "time_per_iteration": 2.6312057971954346 }, { "auxiliary_loss_clip": 0.01106879, "auxiliary_loss_mlp": 0.01046687, "balance_loss_clip": 1.04205859, "balance_loss_mlp": 1.02767277, "epoch": 0.2781301668420261, "flos": 16727119165440.0, "grad_norm": 1.790813282848893, "language_loss": 0.69947815, "learning_rate": 3.388161431073511e-06, "loss": 0.72101378, "num_input_tokens_seen": 99931405, "step": 4626, "time_per_iteration": 2.7656819820404053 }, { "auxiliary_loss_clip": 0.0110548, "auxiliary_loss_mlp": 0.01041917, "balance_loss_clip": 1.04827905, "balance_loss_mlp": 1.02385652, "epoch": 0.27819029009469415, "flos": 13844749317120.0, "grad_norm": 2.1086116607571546, "language_loss": 0.92367601, "learning_rate": 3.38788103238661e-06, "loss": 0.94515002, "num_input_tokens_seen": 99948100, "step": 4627, "time_per_iteration": 2.8608667850494385 }, { "auxiliary_loss_clip": 0.01149683, "auxiliary_loss_mlp": 0.01040775, "balance_loss_clip": 1.05388021, "balance_loss_mlp": 1.0248611, "epoch": 0.2782504133473621, "flos": 27089286370560.0, "grad_norm": 1.7290354122756755, "language_loss": 0.85490036, "learning_rate": 3.387600581071121e-06, "loss": 0.87680495, "num_input_tokens_seen": 99966470, "step": 4628, "time_per_iteration": 2.6468069553375244 }, { "auxiliary_loss_clip": 0.01114712, "auxiliary_loss_mlp": 0.0104202, "balance_loss_clip": 1.0482378, "balance_loss_mlp": 1.02509212, "epoch": 0.2783105366000301, "flos": 21068826773760.0, "grad_norm": 1.5106040860694088, "language_loss": 0.79246545, "learning_rate": 3.387320077137679e-06, "loss": 0.81403273, "num_input_tokens_seen": 99985930, "step": 4629, "time_per_iteration": 5.656833648681641 }, { "auxiliary_loss_clip": 0.01100825, "auxiliary_loss_mlp": 0.01040328, "balance_loss_clip": 1.04602218, "balance_loss_mlp": 1.02339983, "epoch": 0.27837065985269804, "flos": 26501823434880.0, "grad_norm": 1.5125577415085874, "language_loss": 0.84574991, "learning_rate": 3.3870395205969208e-06, "loss": 0.86716145, "num_input_tokens_seen": 100006235, "step": 4630, "time_per_iteration": 2.70917010307312 }, { "auxiliary_loss_clip": 0.01123828, "auxiliary_loss_mlp": 0.01038547, "balance_loss_clip": 1.04848623, "balance_loss_mlp": 1.02099967, "epoch": 0.278430783105366, "flos": 20223201813120.0, "grad_norm": 2.1016222667741857, "language_loss": 0.81134796, "learning_rate": 3.386758911459485e-06, "loss": 0.83297169, "num_input_tokens_seen": 100023655, "step": 4631, "time_per_iteration": 4.19342041015625 }, { "auxiliary_loss_clip": 0.01149092, "auxiliary_loss_mlp": 0.01049428, "balance_loss_clip": 1.05402875, "balance_loss_mlp": 1.03257155, "epoch": 0.278490906358034, "flos": 25592888753280.0, "grad_norm": 3.9436500565538295, "language_loss": 0.71196103, "learning_rate": 3.3864782497360126e-06, "loss": 0.7339462, "num_input_tokens_seen": 100043280, "step": 4632, "time_per_iteration": 2.620439291000366 }, { "auxiliary_loss_clip": 0.01132813, "auxiliary_loss_mlp": 0.01044268, "balance_loss_clip": 1.05435467, "balance_loss_mlp": 1.02798355, "epoch": 0.27855102961070194, "flos": 16171544528640.0, "grad_norm": 1.8243983980851597, "language_loss": 0.82563186, "learning_rate": 3.386197535437145e-06, "loss": 0.84740269, "num_input_tokens_seen": 100057690, "step": 4633, "time_per_iteration": 2.6531693935394287 }, { "auxiliary_loss_clip": 0.01122775, "auxiliary_loss_mlp": 0.01039803, "balance_loss_clip": 1.04714537, "balance_loss_mlp": 1.02130151, "epoch": 0.2786111528633699, "flos": 22927598749440.0, "grad_norm": 1.6667943176882647, "language_loss": 0.87727869, "learning_rate": 3.385916768573529e-06, "loss": 0.89890444, "num_input_tokens_seen": 100075875, "step": 4634, "time_per_iteration": 4.391691446304321 }, { "auxiliary_loss_clip": 0.01118626, "auxiliary_loss_mlp": 0.01042889, "balance_loss_clip": 1.04900146, "balance_loss_mlp": 1.02503181, "epoch": 0.27867127611603787, "flos": 23404205335680.0, "grad_norm": 1.8664238108113964, "language_loss": 0.7701081, "learning_rate": 3.38563594915581e-06, "loss": 0.79172325, "num_input_tokens_seen": 100092930, "step": 4635, "time_per_iteration": 2.7107748985290527 }, { "auxiliary_loss_clip": 0.01148262, "auxiliary_loss_mlp": 0.01044984, "balance_loss_clip": 1.05233121, "balance_loss_mlp": 1.02705491, "epoch": 0.27873139936870583, "flos": 19829010983040.0, "grad_norm": 1.6280540509164947, "language_loss": 0.65174443, "learning_rate": 3.385355077194637e-06, "loss": 0.67367697, "num_input_tokens_seen": 100110790, "step": 4636, "time_per_iteration": 2.660099744796753 }, { "auxiliary_loss_clip": 0.01134021, "auxiliary_loss_mlp": 0.01042528, "balance_loss_clip": 1.048437, "balance_loss_mlp": 1.0243845, "epoch": 0.2787915226213738, "flos": 17707659609600.0, "grad_norm": 2.8501862977667667, "language_loss": 0.83485681, "learning_rate": 3.3850741527006604e-06, "loss": 0.85662234, "num_input_tokens_seen": 100126970, "step": 4637, "time_per_iteration": 2.6234302520751953 }, { "auxiliary_loss_clip": 0.01117465, "auxiliary_loss_mlp": 0.01043194, "balance_loss_clip": 1.04580319, "balance_loss_mlp": 1.02658796, "epoch": 0.27885164587404176, "flos": 22090557139200.0, "grad_norm": 1.4481958644660236, "language_loss": 0.75996393, "learning_rate": 3.384793175684533e-06, "loss": 0.78157055, "num_input_tokens_seen": 100146720, "step": 4638, "time_per_iteration": 2.6488263607025146 }, { "auxiliary_loss_clip": 0.0113367, "auxiliary_loss_mlp": 0.01047522, "balance_loss_clip": 1.04905438, "balance_loss_mlp": 1.02935445, "epoch": 0.27891176912670973, "flos": 19207684500480.0, "grad_norm": 1.973043880665722, "language_loss": 0.71658665, "learning_rate": 3.38451214615691e-06, "loss": 0.73839855, "num_input_tokens_seen": 100165920, "step": 4639, "time_per_iteration": 2.606290817260742 }, { "auxiliary_loss_clip": 0.01134631, "auxiliary_loss_mlp": 0.01040486, "balance_loss_clip": 1.04905224, "balance_loss_mlp": 1.02213931, "epoch": 0.27897189237937775, "flos": 27600007898880.0, "grad_norm": 1.9413688357819885, "language_loss": 0.6546669, "learning_rate": 3.384231064128447e-06, "loss": 0.67641807, "num_input_tokens_seen": 100185525, "step": 4640, "time_per_iteration": 2.670572280883789 }, { "auxiliary_loss_clip": 0.01134835, "auxiliary_loss_mlp": 0.01040753, "balance_loss_clip": 1.05033112, "balance_loss_mlp": 1.02394438, "epoch": 0.2790320156320457, "flos": 21178210665600.0, "grad_norm": 2.0528630099938385, "language_loss": 0.72150993, "learning_rate": 3.383949929609804e-06, "loss": 0.74326581, "num_input_tokens_seen": 100204850, "step": 4641, "time_per_iteration": 2.693377733230591 }, { "auxiliary_loss_clip": 0.01112862, "auxiliary_loss_mlp": 0.01043132, "balance_loss_clip": 1.05076349, "balance_loss_mlp": 1.02322423, "epoch": 0.2790921388847137, "flos": 22783920347520.0, "grad_norm": 1.7365449070814052, "language_loss": 0.74695385, "learning_rate": 3.383668742611641e-06, "loss": 0.7685138, "num_input_tokens_seen": 100224520, "step": 4642, "time_per_iteration": 2.7462241649627686 }, { "auxiliary_loss_clip": 0.0111075, "auxiliary_loss_mlp": 0.01045242, "balance_loss_clip": 1.04543257, "balance_loss_mlp": 1.02603781, "epoch": 0.27915226213738165, "flos": 23400649889280.0, "grad_norm": 1.8272594017764643, "language_loss": 0.85924351, "learning_rate": 3.3833875031446205e-06, "loss": 0.88080341, "num_input_tokens_seen": 100243935, "step": 4643, "time_per_iteration": 2.725135564804077 }, { "auxiliary_loss_clip": 0.01105223, "auxiliary_loss_mlp": 0.01045051, "balance_loss_clip": 1.04933143, "balance_loss_mlp": 1.02697933, "epoch": 0.2792123853900496, "flos": 22747794243840.0, "grad_norm": 1.7474380366240072, "language_loss": 0.83161986, "learning_rate": 3.383106211219407e-06, "loss": 0.85312265, "num_input_tokens_seen": 100262290, "step": 4644, "time_per_iteration": 2.7356133460998535 }, { "auxiliary_loss_clip": 0.01135825, "auxiliary_loss_mlp": 0.01044339, "balance_loss_clip": 1.04996896, "balance_loss_mlp": 1.02672005, "epoch": 0.2792725086427176, "flos": 15049372757760.0, "grad_norm": 1.8326156585035789, "language_loss": 0.79077673, "learning_rate": 3.3828248668466673e-06, "loss": 0.81257844, "num_input_tokens_seen": 100280015, "step": 4645, "time_per_iteration": 2.6605966091156006 }, { "auxiliary_loss_clip": 0.01043101, "auxiliary_loss_mlp": 0.01005168, "balance_loss_clip": 1.02972245, "balance_loss_mlp": 1.00273657, "epoch": 0.27933263189538554, "flos": 62544861757440.0, "grad_norm": 0.7804050577208047, "language_loss": 0.62298429, "learning_rate": 3.3825434700370705e-06, "loss": 0.64346695, "num_input_tokens_seen": 100338935, "step": 4646, "time_per_iteration": 3.203944206237793 }, { "auxiliary_loss_clip": 0.01116876, "auxiliary_loss_mlp": 0.01036795, "balance_loss_clip": 1.05170095, "balance_loss_mlp": 1.02054703, "epoch": 0.2793927551480535, "flos": 25118365155840.0, "grad_norm": 1.6679902986930268, "language_loss": 0.89280778, "learning_rate": 3.3822620208012865e-06, "loss": 0.91434449, "num_input_tokens_seen": 100359905, "step": 4647, "time_per_iteration": 2.829617500305176 }, { "auxiliary_loss_clip": 0.0113911, "auxiliary_loss_mlp": 0.01047084, "balance_loss_clip": 1.05125523, "balance_loss_mlp": 1.02880919, "epoch": 0.27945287840072147, "flos": 21324582587520.0, "grad_norm": 1.8012650128540075, "language_loss": 0.86784112, "learning_rate": 3.381980519149988e-06, "loss": 0.88970304, "num_input_tokens_seen": 100376955, "step": 4648, "time_per_iteration": 2.632321357727051 }, { "auxiliary_loss_clip": 0.01134603, "auxiliary_loss_mlp": 0.01044893, "balance_loss_clip": 1.05110133, "balance_loss_mlp": 1.02733302, "epoch": 0.27951300165338944, "flos": 27450547407360.0, "grad_norm": 2.0026822782024705, "language_loss": 0.73003638, "learning_rate": 3.38169896509385e-06, "loss": 0.75183129, "num_input_tokens_seen": 100397545, "step": 4649, "time_per_iteration": 2.7211172580718994 }, { "auxiliary_loss_clip": 0.01111127, "auxiliary_loss_mlp": 0.01044981, "balance_loss_clip": 1.04752195, "balance_loss_mlp": 1.02557421, "epoch": 0.2795731249060574, "flos": 15159008044800.0, "grad_norm": 2.1164331968139325, "language_loss": 0.80629992, "learning_rate": 3.381417358643549e-06, "loss": 0.82786095, "num_input_tokens_seen": 100415080, "step": 4650, "time_per_iteration": 2.7502310276031494 }, { "auxiliary_loss_clip": 0.01039445, "auxiliary_loss_mlp": 0.00754956, "balance_loss_clip": 1.03124094, "balance_loss_mlp": 1.00203133, "epoch": 0.27963324815872537, "flos": 60120103178880.0, "grad_norm": 0.8151234776797575, "language_loss": 0.58806145, "learning_rate": 3.3811356998097624e-06, "loss": 0.60600549, "num_input_tokens_seen": 100471105, "step": 4651, "time_per_iteration": 3.2224526405334473 }, { "auxiliary_loss_clip": 0.01135312, "auxiliary_loss_mlp": 0.01047398, "balance_loss_clip": 1.04708123, "balance_loss_mlp": 1.02753818, "epoch": 0.27969337141139333, "flos": 21765960910080.0, "grad_norm": 1.7351399642666463, "language_loss": 0.74332011, "learning_rate": 3.3808539886031726e-06, "loss": 0.76514727, "num_input_tokens_seen": 100492520, "step": 4652, "time_per_iteration": 2.685736894607544 }, { "auxiliary_loss_clip": 0.01148943, "auxiliary_loss_mlp": 0.01045678, "balance_loss_clip": 1.05235481, "balance_loss_mlp": 1.02742696, "epoch": 0.27975349466406135, "flos": 39851398834560.0, "grad_norm": 2.2003219434248633, "language_loss": 0.79789567, "learning_rate": 3.380572225034461e-06, "loss": 0.81984192, "num_input_tokens_seen": 100512870, "step": 4653, "time_per_iteration": 2.7558584213256836 }, { "auxiliary_loss_clip": 0.01121239, "auxiliary_loss_mlp": 0.01050268, "balance_loss_clip": 1.04883742, "balance_loss_mlp": 1.03280401, "epoch": 0.2798136179167293, "flos": 21579799697280.0, "grad_norm": 2.080129868341082, "language_loss": 0.78903222, "learning_rate": 3.380290409114312e-06, "loss": 0.81074733, "num_input_tokens_seen": 100531655, "step": 4654, "time_per_iteration": 2.6496095657348633 }, { "auxiliary_loss_clip": 0.01101836, "auxiliary_loss_mlp": 0.01052085, "balance_loss_clip": 1.04982615, "balance_loss_mlp": 1.03267753, "epoch": 0.2798737411693973, "flos": 21537676022400.0, "grad_norm": 2.0985102630300134, "language_loss": 0.81319463, "learning_rate": 3.3800085408534127e-06, "loss": 0.83473378, "num_input_tokens_seen": 100548005, "step": 4655, "time_per_iteration": 2.742586135864258 }, { "auxiliary_loss_clip": 0.01112605, "auxiliary_loss_mlp": 0.00776867, "balance_loss_clip": 1.04759109, "balance_loss_mlp": 1.00071263, "epoch": 0.27993386442206525, "flos": 26981051713920.0, "grad_norm": 1.7515804597190672, "language_loss": 0.81455064, "learning_rate": 3.3797266202624506e-06, "loss": 0.83344543, "num_input_tokens_seen": 100567980, "step": 4656, "time_per_iteration": 2.796480894088745 }, { "auxiliary_loss_clip": 0.01120191, "auxiliary_loss_mlp": 0.01050328, "balance_loss_clip": 1.05115008, "balance_loss_mlp": 1.03204143, "epoch": 0.2799939876747332, "flos": 24349876652160.0, "grad_norm": 2.044588364139205, "language_loss": 0.83203471, "learning_rate": 3.3794446473521176e-06, "loss": 0.85373986, "num_input_tokens_seen": 100588630, "step": 4657, "time_per_iteration": 2.6785871982574463 }, { "auxiliary_loss_clip": 0.01111476, "auxiliary_loss_mlp": 0.01052182, "balance_loss_clip": 1.04937756, "balance_loss_mlp": 1.03294206, "epoch": 0.2800541109274012, "flos": 33656988648960.0, "grad_norm": 2.165484252442401, "language_loss": 0.63694274, "learning_rate": 3.379162622133105e-06, "loss": 0.65857935, "num_input_tokens_seen": 100608775, "step": 4658, "time_per_iteration": 2.879409074783325 }, { "auxiliary_loss_clip": 0.01136248, "auxiliary_loss_mlp": 0.010462, "balance_loss_clip": 1.0495683, "balance_loss_mlp": 1.02822304, "epoch": 0.28011423418006914, "flos": 21614417429760.0, "grad_norm": 1.7192056687926605, "language_loss": 0.78342974, "learning_rate": 3.3788805446161073e-06, "loss": 0.80525422, "num_input_tokens_seen": 100627975, "step": 4659, "time_per_iteration": 2.6989047527313232 }, { "auxiliary_loss_clip": 0.0111004, "auxiliary_loss_mlp": 0.01054733, "balance_loss_clip": 1.04974771, "balance_loss_mlp": 1.03588593, "epoch": 0.2801743574327371, "flos": 23112431159040.0, "grad_norm": 1.755148683242289, "language_loss": 0.79341501, "learning_rate": 3.3785984148118215e-06, "loss": 0.8150627, "num_input_tokens_seen": 100645430, "step": 4660, "time_per_iteration": 2.715477705001831 }, { "auxiliary_loss_clip": 0.01108147, "auxiliary_loss_mlp": 0.01046506, "balance_loss_clip": 1.05007386, "balance_loss_mlp": 1.02897, "epoch": 0.2802344806854051, "flos": 12641418766080.0, "grad_norm": 2.2526204230687115, "language_loss": 0.80604905, "learning_rate": 3.3783162327309453e-06, "loss": 0.82759559, "num_input_tokens_seen": 100663775, "step": 4661, "time_per_iteration": 2.7715258598327637 }, { "auxiliary_loss_clip": 0.01125452, "auxiliary_loss_mlp": 0.01056292, "balance_loss_clip": 1.05232596, "balance_loss_mlp": 1.03836262, "epoch": 0.28029460393807304, "flos": 37267878142080.0, "grad_norm": 1.5529278028038542, "language_loss": 0.79010582, "learning_rate": 3.3780339983841794e-06, "loss": 0.81192333, "num_input_tokens_seen": 100686085, "step": 4662, "time_per_iteration": 2.81427264213562 }, { "auxiliary_loss_clip": 0.01133119, "auxiliary_loss_mlp": 0.01052014, "balance_loss_clip": 1.05226839, "balance_loss_mlp": 1.03252363, "epoch": 0.280354727190741, "flos": 20741106061440.0, "grad_norm": 1.6202884167711182, "language_loss": 0.69617724, "learning_rate": 3.377751711782227e-06, "loss": 0.71802866, "num_input_tokens_seen": 100705135, "step": 4663, "time_per_iteration": 2.697368860244751 }, { "auxiliary_loss_clip": 0.01124677, "auxiliary_loss_mlp": 0.01049339, "balance_loss_clip": 1.05170035, "balance_loss_mlp": 1.03104067, "epoch": 0.28041485044340897, "flos": 21471026336640.0, "grad_norm": 1.9196144000248758, "language_loss": 0.77708608, "learning_rate": 3.377469372935791e-06, "loss": 0.79882622, "num_input_tokens_seen": 100724960, "step": 4664, "time_per_iteration": 2.7275149822235107 }, { "auxiliary_loss_clip": 0.01107718, "auxiliary_loss_mlp": 0.01048769, "balance_loss_clip": 1.0480299, "balance_loss_mlp": 1.03099537, "epoch": 0.28047497369607693, "flos": 14794263388800.0, "grad_norm": 1.999889511399453, "language_loss": 0.79593849, "learning_rate": 3.377186981855578e-06, "loss": 0.81750339, "num_input_tokens_seen": 100741995, "step": 4665, "time_per_iteration": 2.710507392883301 }, { "auxiliary_loss_clip": 0.01132609, "auxiliary_loss_mlp": 0.01044622, "balance_loss_clip": 1.04908824, "balance_loss_mlp": 1.02724159, "epoch": 0.2805350969487449, "flos": 23070738447360.0, "grad_norm": 1.8624041004678782, "language_loss": 0.81080002, "learning_rate": 3.3769045385522968e-06, "loss": 0.83257234, "num_input_tokens_seen": 100758985, "step": 4666, "time_per_iteration": 2.6129403114318848 }, { "auxiliary_loss_clip": 0.01108409, "auxiliary_loss_mlp": 0.01071225, "balance_loss_clip": 1.04823136, "balance_loss_mlp": 1.05097127, "epoch": 0.2805952202014129, "flos": 20479855466880.0, "grad_norm": 2.103406835637469, "language_loss": 0.84507895, "learning_rate": 3.376622043036658e-06, "loss": 0.86687529, "num_input_tokens_seen": 100777820, "step": 4667, "time_per_iteration": 2.7332448959350586 }, { "auxiliary_loss_clip": 0.01123034, "auxiliary_loss_mlp": 0.00775483, "balance_loss_clip": 1.05581784, "balance_loss_mlp": 1.00072694, "epoch": 0.2806553434540809, "flos": 27417330305280.0, "grad_norm": 3.1307253624061486, "language_loss": 0.79295927, "learning_rate": 3.376339495319373e-06, "loss": 0.81194448, "num_input_tokens_seen": 100798205, "step": 4668, "time_per_iteration": 5.80406928062439 }, { "auxiliary_loss_clip": 0.01086886, "auxiliary_loss_mlp": 0.01042603, "balance_loss_clip": 1.04659402, "balance_loss_mlp": 1.02432859, "epoch": 0.28071546670674885, "flos": 26505019745280.0, "grad_norm": 1.6340052887006857, "language_loss": 0.76323926, "learning_rate": 3.3760568954111563e-06, "loss": 0.7845341, "num_input_tokens_seen": 100819800, "step": 4669, "time_per_iteration": 2.909986734390259 }, { "auxiliary_loss_clip": 0.01135126, "auxiliary_loss_mlp": 0.01048727, "balance_loss_clip": 1.05091906, "balance_loss_mlp": 1.03104806, "epoch": 0.2807755899594168, "flos": 20558679863040.0, "grad_norm": 2.509610012971093, "language_loss": 0.79246378, "learning_rate": 3.375774243322725e-06, "loss": 0.81430233, "num_input_tokens_seen": 100837880, "step": 4670, "time_per_iteration": 4.177394866943359 }, { "auxiliary_loss_clip": 0.01106377, "auxiliary_loss_mlp": 0.01050214, "balance_loss_clip": 1.04797912, "balance_loss_mlp": 1.03053236, "epoch": 0.2808357132120848, "flos": 24313319585280.0, "grad_norm": 2.7368773080153455, "language_loss": 0.79247916, "learning_rate": 3.3754915390647955e-06, "loss": 0.81404507, "num_input_tokens_seen": 100856350, "step": 4671, "time_per_iteration": 2.711390256881714 }, { "auxiliary_loss_clip": 0.01127751, "auxiliary_loss_mlp": 0.01045588, "balance_loss_clip": 1.05121446, "balance_loss_mlp": 1.02806473, "epoch": 0.28089583646475275, "flos": 26432408401920.0, "grad_norm": 1.6750085767967255, "language_loss": 0.74537772, "learning_rate": 3.37520878264809e-06, "loss": 0.76711112, "num_input_tokens_seen": 100876135, "step": 4672, "time_per_iteration": 2.661121129989624 }, { "auxiliary_loss_clip": 0.01124033, "auxiliary_loss_mlp": 0.01050888, "balance_loss_clip": 1.04696918, "balance_loss_mlp": 1.03130245, "epoch": 0.2809559597174207, "flos": 23111820627840.0, "grad_norm": 2.8450273884489805, "language_loss": 0.75648308, "learning_rate": 3.3749259740833286e-06, "loss": 0.77823234, "num_input_tokens_seen": 100894790, "step": 4673, "time_per_iteration": 2.672701120376587 }, { "auxiliary_loss_clip": 0.0113134, "auxiliary_loss_mlp": 0.01042591, "balance_loss_clip": 1.04937172, "balance_loss_mlp": 1.02492452, "epoch": 0.2810160829700887, "flos": 20923496346240.0, "grad_norm": 1.8533271967959946, "language_loss": 0.72668427, "learning_rate": 3.374643113381237e-06, "loss": 0.74842358, "num_input_tokens_seen": 100915100, "step": 4674, "time_per_iteration": 4.2516560554504395 }, { "auxiliary_loss_clip": 0.01138771, "auxiliary_loss_mlp": 0.01046386, "balance_loss_clip": 1.05174136, "balance_loss_mlp": 1.02751493, "epoch": 0.28107620622275664, "flos": 14355901808640.0, "grad_norm": 2.0688845921593377, "language_loss": 0.77195638, "learning_rate": 3.374360200552541e-06, "loss": 0.79380798, "num_input_tokens_seen": 100932795, "step": 4675, "time_per_iteration": 2.618218183517456 }, { "auxiliary_loss_clip": 0.01149881, "auxiliary_loss_mlp": 0.01047998, "balance_loss_clip": 1.05321908, "balance_loss_mlp": 1.02948523, "epoch": 0.2811363294754246, "flos": 20919078973440.0, "grad_norm": 1.9283078401930889, "language_loss": 0.70211101, "learning_rate": 3.374077235607968e-06, "loss": 0.7240898, "num_input_tokens_seen": 100950505, "step": 4676, "time_per_iteration": 2.59861159324646 }, { "auxiliary_loss_clip": 0.01144319, "auxiliary_loss_mlp": 0.01042342, "balance_loss_clip": 1.05481541, "balance_loss_mlp": 1.02517629, "epoch": 0.28119645272809257, "flos": 20594841880320.0, "grad_norm": 1.6132814643409343, "language_loss": 0.7048012, "learning_rate": 3.3737942185582487e-06, "loss": 0.72666782, "num_input_tokens_seen": 100968790, "step": 4677, "time_per_iteration": 2.6064453125 }, { "auxiliary_loss_clip": 0.01125461, "auxiliary_loss_mlp": 0.01047839, "balance_loss_clip": 1.04849231, "balance_loss_mlp": 1.02783537, "epoch": 0.28125657598076054, "flos": 25337420248320.0, "grad_norm": 1.5663130673511025, "language_loss": 0.639018, "learning_rate": 3.3735111494141153e-06, "loss": 0.66075099, "num_input_tokens_seen": 100990205, "step": 4678, "time_per_iteration": 2.6609809398651123 }, { "auxiliary_loss_clip": 0.01134563, "auxiliary_loss_mlp": 0.01050264, "balance_loss_clip": 1.05104351, "balance_loss_mlp": 1.03315794, "epoch": 0.2813166992334285, "flos": 24827093769600.0, "grad_norm": 5.827919401990006, "language_loss": 0.70568973, "learning_rate": 3.3732280281863013e-06, "loss": 0.72753799, "num_input_tokens_seen": 101009815, "step": 4679, "time_per_iteration": 2.7039310932159424 }, { "auxiliary_loss_clip": 0.01134537, "auxiliary_loss_mlp": 0.01040896, "balance_loss_clip": 1.05048108, "balance_loss_mlp": 1.02283621, "epoch": 0.2813768224860965, "flos": 21760753438080.0, "grad_norm": 2.2073803144691255, "language_loss": 0.74848735, "learning_rate": 3.3729448548855422e-06, "loss": 0.77024174, "num_input_tokens_seen": 101026780, "step": 4680, "time_per_iteration": 2.6897919178009033 }, { "auxiliary_loss_clip": 0.01149427, "auxiliary_loss_mlp": 0.01039945, "balance_loss_clip": 1.05414999, "balance_loss_mlp": 1.02363694, "epoch": 0.2814369457387645, "flos": 24316803204480.0, "grad_norm": 2.2743778704427267, "language_loss": 0.7719292, "learning_rate": 3.3726616295225774e-06, "loss": 0.793823, "num_input_tokens_seen": 101046215, "step": 4681, "time_per_iteration": 2.6178102493286133 }, { "auxiliary_loss_clip": 0.01138594, "auxiliary_loss_mlp": 0.01037179, "balance_loss_clip": 1.05333447, "balance_loss_mlp": 1.01864183, "epoch": 0.28149706899143245, "flos": 18515326872960.0, "grad_norm": 2.5230258038951723, "language_loss": 0.74197519, "learning_rate": 3.372378352108146e-06, "loss": 0.76373291, "num_input_tokens_seen": 101063365, "step": 4682, "time_per_iteration": 2.5892751216888428 }, { "auxiliary_loss_clip": 0.01145225, "auxiliary_loss_mlp": 0.01043744, "balance_loss_clip": 1.05250573, "balance_loss_mlp": 1.02619636, "epoch": 0.2815571922441004, "flos": 24863255786880.0, "grad_norm": 1.5493572746384299, "language_loss": 0.81096184, "learning_rate": 3.3720950226529894e-06, "loss": 0.83285153, "num_input_tokens_seen": 101083835, "step": 4683, "time_per_iteration": 2.6272947788238525 }, { "auxiliary_loss_clip": 0.01089095, "auxiliary_loss_mlp": 0.01048071, "balance_loss_clip": 1.04691851, "balance_loss_mlp": 1.02916479, "epoch": 0.2816173154967684, "flos": 19901622326400.0, "grad_norm": 1.5570192452178944, "language_loss": 0.76437271, "learning_rate": 3.371811641167852e-06, "loss": 0.78574431, "num_input_tokens_seen": 101101740, "step": 4684, "time_per_iteration": 2.7542243003845215 }, { "auxiliary_loss_clip": 0.01090035, "auxiliary_loss_mlp": 0.01043858, "balance_loss_clip": 1.04495156, "balance_loss_mlp": 1.02659678, "epoch": 0.28167743874943635, "flos": 17491333950720.0, "grad_norm": 3.250404845672824, "language_loss": 0.76287019, "learning_rate": 3.3715282076634807e-06, "loss": 0.78420913, "num_input_tokens_seen": 101120480, "step": 4685, "time_per_iteration": 2.724954843521118 }, { "auxiliary_loss_clip": 0.01116834, "auxiliary_loss_mlp": 0.01045285, "balance_loss_clip": 1.05042076, "balance_loss_mlp": 1.02820265, "epoch": 0.2817375620021043, "flos": 25302120157440.0, "grad_norm": 1.80192319881426, "language_loss": 0.75822544, "learning_rate": 3.3712447221506218e-06, "loss": 0.77984667, "num_input_tokens_seen": 101142910, "step": 4686, "time_per_iteration": 2.7375218868255615 }, { "auxiliary_loss_clip": 0.01113965, "auxiliary_loss_mlp": 0.01054481, "balance_loss_clip": 1.04542971, "balance_loss_mlp": 1.03530002, "epoch": 0.2817976852547723, "flos": 18693227957760.0, "grad_norm": 5.9534421572259095, "language_loss": 0.62298906, "learning_rate": 3.370961184640025e-06, "loss": 0.64467359, "num_input_tokens_seen": 101160030, "step": 4687, "time_per_iteration": 2.7273154258728027 }, { "auxiliary_loss_clip": 0.01125077, "auxiliary_loss_mlp": 0.01052662, "balance_loss_clip": 1.05122471, "balance_loss_mlp": 1.03501928, "epoch": 0.28185780850744024, "flos": 22742263549440.0, "grad_norm": 3.512847657951686, "language_loss": 0.76642895, "learning_rate": 3.3706775951424433e-06, "loss": 0.78820634, "num_input_tokens_seen": 101177675, "step": 4688, "time_per_iteration": 2.6962485313415527 }, { "auxiliary_loss_clip": 0.01111064, "auxiliary_loss_mlp": 0.01038903, "balance_loss_clip": 1.050143, "balance_loss_mlp": 1.0222497, "epoch": 0.2819179317601082, "flos": 14933919467520.0, "grad_norm": 2.029299855452059, "language_loss": 0.78377295, "learning_rate": 3.37039395366863e-06, "loss": 0.80527258, "num_input_tokens_seen": 101192225, "step": 4689, "time_per_iteration": 2.7611160278320312 }, { "auxiliary_loss_clip": 0.01101002, "auxiliary_loss_mlp": 0.01042004, "balance_loss_clip": 1.044873, "balance_loss_mlp": 1.02469492, "epoch": 0.2819780550127762, "flos": 23145325038720.0, "grad_norm": 1.6619977361488503, "language_loss": 0.78151089, "learning_rate": 3.37011026022934e-06, "loss": 0.80294096, "num_input_tokens_seen": 101210870, "step": 4690, "time_per_iteration": 2.8166253566741943 }, { "auxiliary_loss_clip": 0.01144307, "auxiliary_loss_mlp": 0.0077562, "balance_loss_clip": 1.04972041, "balance_loss_mlp": 1.00065684, "epoch": 0.28203817826544414, "flos": 21616356764160.0, "grad_norm": 1.8251699545436237, "language_loss": 0.87835205, "learning_rate": 3.369826514835332e-06, "loss": 0.8975513, "num_input_tokens_seen": 101229965, "step": 4691, "time_per_iteration": 2.755540609359741 }, { "auxiliary_loss_clip": 0.01120177, "auxiliary_loss_mlp": 0.01057161, "balance_loss_clip": 1.0480932, "balance_loss_mlp": 1.03866005, "epoch": 0.2820983015181121, "flos": 24026788794240.0, "grad_norm": 2.0164591316320086, "language_loss": 0.81783265, "learning_rate": 3.3695427174973654e-06, "loss": 0.83960605, "num_input_tokens_seen": 101250980, "step": 4692, "time_per_iteration": 2.766826868057251 }, { "auxiliary_loss_clip": 0.01108273, "auxiliary_loss_mlp": 0.01044592, "balance_loss_clip": 1.05000174, "balance_loss_mlp": 1.02690101, "epoch": 0.2821584247707801, "flos": 30007925976960.0, "grad_norm": 1.5153062693168577, "language_loss": 0.74520338, "learning_rate": 3.3692588682262022e-06, "loss": 0.76673198, "num_input_tokens_seen": 101273335, "step": 4693, "time_per_iteration": 2.833829402923584 }, { "auxiliary_loss_clip": 0.01107692, "auxiliary_loss_mlp": 0.01038565, "balance_loss_clip": 1.04546356, "balance_loss_mlp": 1.02018356, "epoch": 0.2822185480234481, "flos": 21396762967680.0, "grad_norm": 1.6139880108231377, "language_loss": 0.77396065, "learning_rate": 3.3689749670326046e-06, "loss": 0.79542327, "num_input_tokens_seen": 101292110, "step": 4694, "time_per_iteration": 2.6783409118652344 }, { "auxiliary_loss_clip": 0.01131719, "auxiliary_loss_mlp": 0.01043428, "balance_loss_clip": 1.05066633, "balance_loss_mlp": 1.02610695, "epoch": 0.28227867127611606, "flos": 27452809964160.0, "grad_norm": 2.1245298140537354, "language_loss": 0.67171001, "learning_rate": 3.3686910139273392e-06, "loss": 0.69346148, "num_input_tokens_seen": 101312815, "step": 4695, "time_per_iteration": 2.657508373260498 }, { "auxiliary_loss_clip": 0.01129418, "auxiliary_loss_mlp": 0.01047718, "balance_loss_clip": 1.05160189, "balance_loss_mlp": 1.02857292, "epoch": 0.282338794528784, "flos": 22593736811520.0, "grad_norm": 2.1132011275006297, "language_loss": 0.75410438, "learning_rate": 3.3684070089211736e-06, "loss": 0.77587581, "num_input_tokens_seen": 101329045, "step": 4696, "time_per_iteration": 2.6419622898101807 }, { "auxiliary_loss_clip": 0.01108873, "auxiliary_loss_mlp": 0.01050131, "balance_loss_clip": 1.04857826, "balance_loss_mlp": 1.03241634, "epoch": 0.282398917781452, "flos": 42010923386880.0, "grad_norm": 1.6547739374499746, "language_loss": 0.62379837, "learning_rate": 3.368122952024877e-06, "loss": 0.64538848, "num_input_tokens_seen": 101352715, "step": 4697, "time_per_iteration": 2.863271951675415 }, { "auxiliary_loss_clip": 0.01098306, "auxiliary_loss_mlp": 0.01038026, "balance_loss_clip": 1.04702902, "balance_loss_mlp": 1.0213964, "epoch": 0.28245904103411995, "flos": 23224724052480.0, "grad_norm": 1.3648463295211168, "language_loss": 0.73178887, "learning_rate": 3.3678388432492214e-06, "loss": 0.75315219, "num_input_tokens_seen": 101374640, "step": 4698, "time_per_iteration": 2.7437515258789062 }, { "auxiliary_loss_clip": 0.01138661, "auxiliary_loss_mlp": 0.01044687, "balance_loss_clip": 1.04783368, "balance_loss_mlp": 1.02820039, "epoch": 0.2825191642867879, "flos": 25374623760000.0, "grad_norm": 1.73143255072412, "language_loss": 0.75260699, "learning_rate": 3.3675546826049788e-06, "loss": 0.77444041, "num_input_tokens_seen": 101393595, "step": 4699, "time_per_iteration": 2.6352651119232178 }, { "auxiliary_loss_clip": 0.01130406, "auxiliary_loss_mlp": 0.01042781, "balance_loss_clip": 1.04642487, "balance_loss_mlp": 1.02379072, "epoch": 0.2825792875394559, "flos": 17236799199360.0, "grad_norm": 2.939003683920128, "language_loss": 0.80683541, "learning_rate": 3.3672704701029265e-06, "loss": 0.82856727, "num_input_tokens_seen": 101409265, "step": 4700, "time_per_iteration": 2.597543478012085 }, { "auxiliary_loss_clip": 0.01118395, "auxiliary_loss_mlp": 0.01052226, "balance_loss_clip": 1.05168593, "balance_loss_mlp": 1.03699148, "epoch": 0.28263941079212385, "flos": 26723967096960.0, "grad_norm": 1.8973185440197946, "language_loss": 0.82377315, "learning_rate": 3.3669862057538402e-06, "loss": 0.84547931, "num_input_tokens_seen": 101428365, "step": 4701, "time_per_iteration": 2.6613359451293945 }, { "auxiliary_loss_clip": 0.01079732, "auxiliary_loss_mlp": 0.01044955, "balance_loss_clip": 1.04725862, "balance_loss_mlp": 1.02782488, "epoch": 0.2826995340447918, "flos": 25921327737600.0, "grad_norm": 2.6106451650427913, "language_loss": 0.72911763, "learning_rate": 3.3667018895685004e-06, "loss": 0.75036454, "num_input_tokens_seen": 101447280, "step": 4702, "time_per_iteration": 2.927156448364258 }, { "auxiliary_loss_clip": 0.0114189, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.05118549, "balance_loss_mlp": 1.02240694, "epoch": 0.2827596572974598, "flos": 22379709623040.0, "grad_norm": 2.1110096252533754, "language_loss": 0.78497601, "learning_rate": 3.3664175215576886e-06, "loss": 0.80678773, "num_input_tokens_seen": 101465435, "step": 4703, "time_per_iteration": 2.603217124938965 }, { "auxiliary_loss_clip": 0.01115372, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.04668045, "balance_loss_mlp": 1.03100109, "epoch": 0.28281978055012774, "flos": 33547137880320.0, "grad_norm": 1.6207045759516274, "language_loss": 0.69310379, "learning_rate": 3.3661331017321867e-06, "loss": 0.71475154, "num_input_tokens_seen": 101486355, "step": 4704, "time_per_iteration": 2.737741708755493 }, { "auxiliary_loss_clip": 0.0110991, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.05106401, "balance_loss_mlp": 1.02204967, "epoch": 0.2828799038027957, "flos": 23440870143360.0, "grad_norm": 2.0629797483939893, "language_loss": 0.70487976, "learning_rate": 3.3658486301027807e-06, "loss": 0.72637939, "num_input_tokens_seen": 101505875, "step": 4705, "time_per_iteration": 2.7810943126678467 }, { "auxiliary_loss_clip": 0.01051193, "auxiliary_loss_mlp": 0.01011527, "balance_loss_clip": 1.02885246, "balance_loss_mlp": 1.00905895, "epoch": 0.2829400270554637, "flos": 69873690251520.0, "grad_norm": 0.7331461257989402, "language_loss": 0.59262896, "learning_rate": 3.3655641066802577e-06, "loss": 0.6132561, "num_input_tokens_seen": 101565045, "step": 4706, "time_per_iteration": 3.223500967025757 }, { "auxiliary_loss_clip": 0.01117208, "auxiliary_loss_mlp": 0.01042955, "balance_loss_clip": 1.04750693, "balance_loss_mlp": 1.02711248, "epoch": 0.2830001503081317, "flos": 24789028331520.0, "grad_norm": 1.4542369915695899, "language_loss": 0.82314008, "learning_rate": 3.365279531475407e-06, "loss": 0.84474176, "num_input_tokens_seen": 101585825, "step": 4707, "time_per_iteration": 5.995711326599121 }, { "auxiliary_loss_clip": 0.0112325, "auxiliary_loss_mlp": 0.01043198, "balance_loss_clip": 1.04714823, "balance_loss_mlp": 1.02451742, "epoch": 0.28306027356079966, "flos": 27669387018240.0, "grad_norm": 1.6937335335925583, "language_loss": 0.80196846, "learning_rate": 3.36499490449902e-06, "loss": 0.82363296, "num_input_tokens_seen": 101606105, "step": 4708, "time_per_iteration": 2.730365753173828 }, { "auxiliary_loss_clip": 0.01036827, "auxiliary_loss_mlp": 0.01004906, "balance_loss_clip": 1.0241586, "balance_loss_mlp": 1.00274837, "epoch": 0.2831203968134676, "flos": 60527938199040.0, "grad_norm": 0.8797441515413378, "language_loss": 0.62768304, "learning_rate": 3.3647102257618895e-06, "loss": 0.64810038, "num_input_tokens_seen": 101656875, "step": 4709, "time_per_iteration": 3.0734164714813232 }, { "auxiliary_loss_clip": 0.01113275, "auxiliary_loss_mlp": 0.01045412, "balance_loss_clip": 1.04819441, "balance_loss_mlp": 1.02711344, "epoch": 0.2831805200661356, "flos": 22054790171520.0, "grad_norm": 1.4416556980461737, "language_loss": 0.74092108, "learning_rate": 3.3644254952748103e-06, "loss": 0.76250798, "num_input_tokens_seen": 101676225, "step": 4710, "time_per_iteration": 4.214928388595581 }, { "auxiliary_loss_clip": 0.01108833, "auxiliary_loss_mlp": 0.01058426, "balance_loss_clip": 1.04568553, "balance_loss_mlp": 1.0393765, "epoch": 0.28324064331880355, "flos": 22600668136320.0, "grad_norm": 2.192994300890924, "language_loss": 0.7857554, "learning_rate": 3.364140713048579e-06, "loss": 0.80742794, "num_input_tokens_seen": 101693710, "step": 4711, "time_per_iteration": 2.9334824085235596 }, { "auxiliary_loss_clip": 0.01135754, "auxiliary_loss_mlp": 0.00775746, "balance_loss_clip": 1.05244637, "balance_loss_mlp": 1.00072622, "epoch": 0.2833007665714715, "flos": 30404127968640.0, "grad_norm": 2.328121287113732, "language_loss": 0.70832199, "learning_rate": 3.363855879093996e-06, "loss": 0.72743702, "num_input_tokens_seen": 101714010, "step": 4712, "time_per_iteration": 2.8570704460144043 }, { "auxiliary_loss_clip": 0.0114641, "auxiliary_loss_mlp": 0.01050688, "balance_loss_clip": 1.05171633, "balance_loss_mlp": 1.03284216, "epoch": 0.2833608898241395, "flos": 23549499849600.0, "grad_norm": 2.3843934106626157, "language_loss": 0.81725228, "learning_rate": 3.3635709934218605e-06, "loss": 0.83922327, "num_input_tokens_seen": 101732995, "step": 4713, "time_per_iteration": 4.343034029006958 }, { "auxiliary_loss_clip": 0.01120505, "auxiliary_loss_mlp": 0.01048075, "balance_loss_clip": 1.05054498, "balance_loss_mlp": 1.03044379, "epoch": 0.28342101307680745, "flos": 20266726118400.0, "grad_norm": 1.7964609324305687, "language_loss": 0.75316995, "learning_rate": 3.3632860560429766e-06, "loss": 0.77485573, "num_input_tokens_seen": 101751385, "step": 4714, "time_per_iteration": 2.656919479370117 }, { "auxiliary_loss_clip": 0.01129168, "auxiliary_loss_mlp": 0.01051102, "balance_loss_clip": 1.050372, "balance_loss_mlp": 1.03424633, "epoch": 0.2834811363294754, "flos": 30847050576000.0, "grad_norm": 1.4082553086863412, "language_loss": 0.78457153, "learning_rate": 3.3630010669681494e-06, "loss": 0.80637431, "num_input_tokens_seen": 101773825, "step": 4715, "time_per_iteration": 2.721869468688965 }, { "auxiliary_loss_clip": 0.01117334, "auxiliary_loss_mlp": 0.01046437, "balance_loss_clip": 1.04618871, "balance_loss_mlp": 1.0294199, "epoch": 0.2835412595821434, "flos": 22711021695360.0, "grad_norm": 1.791082386208426, "language_loss": 0.73825723, "learning_rate": 3.3627160262081845e-06, "loss": 0.75989497, "num_input_tokens_seen": 101791920, "step": 4716, "time_per_iteration": 2.689964532852173 }, { "auxiliary_loss_clip": 0.0111778, "auxiliary_loss_mlp": 0.01054857, "balance_loss_clip": 1.04580188, "balance_loss_mlp": 1.03397131, "epoch": 0.28360138283481134, "flos": 18077719478400.0, "grad_norm": 2.1425450832247868, "language_loss": 0.74293232, "learning_rate": 3.3624309337738917e-06, "loss": 0.76465869, "num_input_tokens_seen": 101809515, "step": 4717, "time_per_iteration": 2.653107166290283 }, { "auxiliary_loss_clip": 0.01112398, "auxiliary_loss_mlp": 0.01052347, "balance_loss_clip": 1.04736984, "balance_loss_mlp": 1.03526437, "epoch": 0.2836615060874793, "flos": 17854785717120.0, "grad_norm": 1.96982951308544, "language_loss": 0.67022157, "learning_rate": 3.3621457896760813e-06, "loss": 0.69186902, "num_input_tokens_seen": 101827735, "step": 4718, "time_per_iteration": 2.7287323474884033 }, { "auxiliary_loss_clip": 0.01119996, "auxiliary_loss_mlp": 0.01052629, "balance_loss_clip": 1.04606366, "balance_loss_mlp": 1.03479528, "epoch": 0.2837216293401473, "flos": 25740302169600.0, "grad_norm": 1.7409435577223806, "language_loss": 0.72453725, "learning_rate": 3.361860593925566e-06, "loss": 0.7462635, "num_input_tokens_seen": 101845970, "step": 4719, "time_per_iteration": 2.7101874351501465 }, { "auxiliary_loss_clip": 0.01129472, "auxiliary_loss_mlp": 0.01044, "balance_loss_clip": 1.04724336, "balance_loss_mlp": 1.02711964, "epoch": 0.2837817525928153, "flos": 20923532259840.0, "grad_norm": 1.8163652523997504, "language_loss": 0.80517805, "learning_rate": 3.3615753465331605e-06, "loss": 0.82691276, "num_input_tokens_seen": 101865040, "step": 4720, "time_per_iteration": 2.630380392074585 }, { "auxiliary_loss_clip": 0.01130938, "auxiliary_loss_mlp": 0.01047274, "balance_loss_clip": 1.04798317, "balance_loss_mlp": 1.02935672, "epoch": 0.28384187584548326, "flos": 18916700423040.0, "grad_norm": 2.340232614040239, "language_loss": 0.79146183, "learning_rate": 3.3612900475096817e-06, "loss": 0.81324387, "num_input_tokens_seen": 101883735, "step": 4721, "time_per_iteration": 2.6779117584228516 }, { "auxiliary_loss_clip": 0.01091324, "auxiliary_loss_mlp": 0.00778191, "balance_loss_clip": 1.04653215, "balance_loss_mlp": 1.00074911, "epoch": 0.2839019990981512, "flos": 27343964776320.0, "grad_norm": 1.7859505861297744, "language_loss": 0.82514244, "learning_rate": 3.3610046968659474e-06, "loss": 0.84383762, "num_input_tokens_seen": 101903025, "step": 4722, "time_per_iteration": 2.8601412773132324 }, { "auxiliary_loss_clip": 0.0114735, "auxiliary_loss_mlp": 0.0104339, "balance_loss_clip": 1.05396807, "balance_loss_mlp": 1.02641416, "epoch": 0.2839621223508192, "flos": 18114312458880.0, "grad_norm": 1.8976073667217488, "language_loss": 0.70048773, "learning_rate": 3.3607192946127785e-06, "loss": 0.72239512, "num_input_tokens_seen": 101922255, "step": 4723, "time_per_iteration": 2.6259007453918457 }, { "auxiliary_loss_clip": 0.0111455, "auxiliary_loss_mlp": 0.01051142, "balance_loss_clip": 1.04818106, "balance_loss_mlp": 1.03247368, "epoch": 0.28402224560348716, "flos": 26358360514560.0, "grad_norm": 1.540245146059843, "language_loss": 0.78676599, "learning_rate": 3.360433840760998e-06, "loss": 0.80842292, "num_input_tokens_seen": 101943100, "step": 4724, "time_per_iteration": 2.7364859580993652 }, { "auxiliary_loss_clip": 0.01116323, "auxiliary_loss_mlp": 0.01063488, "balance_loss_clip": 1.04846072, "balance_loss_mlp": 1.04442668, "epoch": 0.2840823688561551, "flos": 24060795995520.0, "grad_norm": 1.6728910575536384, "language_loss": 0.92433345, "learning_rate": 3.36014833532143e-06, "loss": 0.94613159, "num_input_tokens_seen": 101963160, "step": 4725, "time_per_iteration": 2.653244733810425 }, { "auxiliary_loss_clip": 0.01137335, "auxiliary_loss_mlp": 0.01047317, "balance_loss_clip": 1.05249703, "balance_loss_mlp": 1.02951932, "epoch": 0.2841424921088231, "flos": 29459821368960.0, "grad_norm": 1.5774329387244128, "language_loss": 0.88881439, "learning_rate": 3.3598627783049e-06, "loss": 0.91066098, "num_input_tokens_seen": 101984300, "step": 4726, "time_per_iteration": 2.6815872192382812 }, { "auxiliary_loss_clip": 0.01132666, "auxiliary_loss_mlp": 0.01049768, "balance_loss_clip": 1.05290008, "balance_loss_mlp": 1.03223181, "epoch": 0.28420261536149105, "flos": 48100367053440.0, "grad_norm": 2.008368257744288, "language_loss": 0.78913373, "learning_rate": 3.359577169722238e-06, "loss": 0.81095803, "num_input_tokens_seen": 102005765, "step": 4727, "time_per_iteration": 2.8668875694274902 }, { "auxiliary_loss_clip": 0.01134036, "auxiliary_loss_mlp": 0.01041813, "balance_loss_clip": 1.05225933, "balance_loss_mlp": 1.02603006, "epoch": 0.284262738614159, "flos": 25666146541440.0, "grad_norm": 2.1196929739552433, "language_loss": 0.66590458, "learning_rate": 3.3592915095842733e-06, "loss": 0.68766308, "num_input_tokens_seen": 102022755, "step": 4728, "time_per_iteration": 2.6871252059936523 }, { "auxiliary_loss_clip": 0.01111522, "auxiliary_loss_mlp": 0.01054966, "balance_loss_clip": 1.04948676, "balance_loss_mlp": 1.03766847, "epoch": 0.284322861866827, "flos": 19718980646400.0, "grad_norm": 1.7247901443745783, "language_loss": 0.76369143, "learning_rate": 3.3590057979018386e-06, "loss": 0.78535628, "num_input_tokens_seen": 102041850, "step": 4729, "time_per_iteration": 2.671739339828491 }, { "auxiliary_loss_clip": 0.01121198, "auxiliary_loss_mlp": 0.01054506, "balance_loss_clip": 1.05166233, "balance_loss_mlp": 1.03707767, "epoch": 0.28438298511949495, "flos": 23915250086400.0, "grad_norm": 1.8284571123244682, "language_loss": 0.67062581, "learning_rate": 3.3587200346857674e-06, "loss": 0.69238287, "num_input_tokens_seen": 102059500, "step": 4730, "time_per_iteration": 2.6957883834838867 }, { "auxiliary_loss_clip": 0.01120949, "auxiliary_loss_mlp": 0.01040777, "balance_loss_clip": 1.05008078, "balance_loss_mlp": 1.02283621, "epoch": 0.2844431083721629, "flos": 26067340523520.0, "grad_norm": 1.8142087038783352, "language_loss": 0.7456513, "learning_rate": 3.3584342199468965e-06, "loss": 0.76726854, "num_input_tokens_seen": 102080460, "step": 4731, "time_per_iteration": 2.7621212005615234 }, { "auxiliary_loss_clip": 0.01100065, "auxiliary_loss_mlp": 0.0104061, "balance_loss_clip": 1.04959893, "balance_loss_mlp": 1.02338386, "epoch": 0.2845032316248309, "flos": 25810435474560.0, "grad_norm": 1.4533231430590194, "language_loss": 0.83672202, "learning_rate": 3.3581483536960638e-06, "loss": 0.85812879, "num_input_tokens_seen": 102100950, "step": 4732, "time_per_iteration": 2.807701587677002 }, { "auxiliary_loss_clip": 0.01135958, "auxiliary_loss_mlp": 0.01049006, "balance_loss_clip": 1.05248308, "balance_loss_mlp": 1.03040957, "epoch": 0.2845633548774989, "flos": 19823192979840.0, "grad_norm": 2.88493918484894, "language_loss": 0.78892827, "learning_rate": 3.357862435944109e-06, "loss": 0.8107779, "num_input_tokens_seen": 102119345, "step": 4733, "time_per_iteration": 2.66524076461792 }, { "auxiliary_loss_clip": 0.01153472, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.05533004, "balance_loss_mlp": 1.02984452, "epoch": 0.28462347813016686, "flos": 23182815859200.0, "grad_norm": 2.2364375024988776, "language_loss": 0.71791029, "learning_rate": 3.357576466701875e-06, "loss": 0.73992205, "num_input_tokens_seen": 102139050, "step": 4734, "time_per_iteration": 2.6941637992858887 }, { "auxiliary_loss_clip": 0.01125779, "auxiliary_loss_mlp": 0.01035132, "balance_loss_clip": 1.05455363, "balance_loss_mlp": 1.01766825, "epoch": 0.2846836013828348, "flos": 18660477732480.0, "grad_norm": 1.8491255089189595, "language_loss": 0.73942113, "learning_rate": 3.3572904459802056e-06, "loss": 0.76103032, "num_input_tokens_seen": 102157935, "step": 4735, "time_per_iteration": 2.736027956008911 }, { "auxiliary_loss_clip": 0.01124029, "auxiliary_loss_mlp": 0.01048016, "balance_loss_clip": 1.05248201, "balance_loss_mlp": 1.03177929, "epoch": 0.2847437246355028, "flos": 14173511523840.0, "grad_norm": 1.7217440703764713, "language_loss": 0.79690897, "learning_rate": 3.357004373789946e-06, "loss": 0.81862932, "num_input_tokens_seen": 102175325, "step": 4736, "time_per_iteration": 2.7069075107574463 }, { "auxiliary_loss_clip": 0.01152237, "auxiliary_loss_mlp": 0.01048515, "balance_loss_clip": 1.0569663, "balance_loss_mlp": 1.03019249, "epoch": 0.28480384788817076, "flos": 29278364837760.0, "grad_norm": 2.5331890881723327, "language_loss": 0.59956342, "learning_rate": 3.3567182501419453e-06, "loss": 0.62157094, "num_input_tokens_seen": 102196625, "step": 4737, "time_per_iteration": 2.718904972076416 }, { "auxiliary_loss_clip": 0.01131951, "auxiliary_loss_mlp": 0.0104121, "balance_loss_clip": 1.05099404, "balance_loss_mlp": 1.02437758, "epoch": 0.2848639711408387, "flos": 22601314581120.0, "grad_norm": 1.8696274848062555, "language_loss": 0.86556888, "learning_rate": 3.356432075047052e-06, "loss": 0.88730049, "num_input_tokens_seen": 102214975, "step": 4738, "time_per_iteration": 2.719223976135254 }, { "auxiliary_loss_clip": 0.01127313, "auxiliary_loss_mlp": 0.01051123, "balance_loss_clip": 1.05986989, "balance_loss_mlp": 1.03207278, "epoch": 0.2849240943935067, "flos": 17599460866560.0, "grad_norm": 2.688438536338364, "language_loss": 0.90028232, "learning_rate": 3.356145848516118e-06, "loss": 0.92206669, "num_input_tokens_seen": 102231885, "step": 4739, "time_per_iteration": 2.674363851547241 }, { "auxiliary_loss_clip": 0.01136036, "auxiliary_loss_mlp": 0.01044124, "balance_loss_clip": 1.05522013, "balance_loss_mlp": 1.02627802, "epoch": 0.28498421764617465, "flos": 24862573428480.0, "grad_norm": 1.41783833400805, "language_loss": 0.7216897, "learning_rate": 3.355859570559998e-06, "loss": 0.74349129, "num_input_tokens_seen": 102252725, "step": 4740, "time_per_iteration": 2.688591957092285 }, { "auxiliary_loss_clip": 0.01130927, "auxiliary_loss_mlp": 0.010392, "balance_loss_clip": 1.05868936, "balance_loss_mlp": 1.02229571, "epoch": 0.2850443408988426, "flos": 22782555630720.0, "grad_norm": 3.325446081949271, "language_loss": 0.77782756, "learning_rate": 3.3555732411895477e-06, "loss": 0.79952878, "num_input_tokens_seen": 102271730, "step": 4741, "time_per_iteration": 2.6747119426727295 }, { "auxiliary_loss_clip": 0.01107503, "auxiliary_loss_mlp": 0.01048819, "balance_loss_clip": 1.04771924, "balance_loss_mlp": 1.03065109, "epoch": 0.2851044641515106, "flos": 18844053166080.0, "grad_norm": 1.6557809034578879, "language_loss": 0.75952959, "learning_rate": 3.3552868604156235e-06, "loss": 0.78109288, "num_input_tokens_seen": 102291325, "step": 4742, "time_per_iteration": 2.7584095001220703 }, { "auxiliary_loss_clip": 0.01151989, "auxiliary_loss_mlp": 0.01057399, "balance_loss_clip": 1.05341601, "balance_loss_mlp": 1.03720486, "epoch": 0.28516458740417855, "flos": 18880502492160.0, "grad_norm": 2.0538587827096713, "language_loss": 0.57376975, "learning_rate": 3.355000428249086e-06, "loss": 0.59586358, "num_input_tokens_seen": 102309000, "step": 4743, "time_per_iteration": 2.621572494506836 }, { "auxiliary_loss_clip": 0.01116239, "auxiliary_loss_mlp": 0.01056356, "balance_loss_clip": 1.05067348, "balance_loss_mlp": 1.03747356, "epoch": 0.2852247106568465, "flos": 25299821687040.0, "grad_norm": 1.6259491452975234, "language_loss": 0.74499846, "learning_rate": 3.354713944700797e-06, "loss": 0.76672441, "num_input_tokens_seen": 102329240, "step": 4744, "time_per_iteration": 2.8029959201812744 }, { "auxiliary_loss_clip": 0.01132324, "auxiliary_loss_mlp": 0.01047205, "balance_loss_clip": 1.05420351, "balance_loss_mlp": 1.03014612, "epoch": 0.2852848339095145, "flos": 11655383541120.0, "grad_norm": 2.4725597828733563, "language_loss": 0.77258176, "learning_rate": 3.3544274097816185e-06, "loss": 0.79437709, "num_input_tokens_seen": 102344440, "step": 4745, "time_per_iteration": 2.5961194038391113 }, { "auxiliary_loss_clip": 0.01124474, "auxiliary_loss_mlp": 0.01040571, "balance_loss_clip": 1.05262041, "balance_loss_mlp": 1.02427554, "epoch": 0.2853449571621825, "flos": 12933228856320.0, "grad_norm": 1.9164884333366974, "language_loss": 0.8275286, "learning_rate": 3.3541408235024173e-06, "loss": 0.84917903, "num_input_tokens_seen": 102360985, "step": 4746, "time_per_iteration": 4.211855411529541 }, { "auxiliary_loss_clip": 0.01101779, "auxiliary_loss_mlp": 0.01043428, "balance_loss_clip": 1.0488627, "balance_loss_mlp": 1.02497482, "epoch": 0.28540508041485046, "flos": 20010575255040.0, "grad_norm": 1.8281951571940926, "language_loss": 0.79537141, "learning_rate": 3.3538541858740604e-06, "loss": 0.81682348, "num_input_tokens_seen": 102380320, "step": 4747, "time_per_iteration": 4.276613712310791 }, { "auxiliary_loss_clip": 0.01046154, "auxiliary_loss_mlp": 0.01017989, "balance_loss_clip": 1.02844512, "balance_loss_mlp": 1.01572371, "epoch": 0.28546520366751843, "flos": 68139349966080.0, "grad_norm": 0.7754147669680839, "language_loss": 0.6049211, "learning_rate": 3.3535674969074173e-06, "loss": 0.62556255, "num_input_tokens_seen": 102439140, "step": 4748, "time_per_iteration": 3.0963478088378906 }, { "auxiliary_loss_clip": 0.01148062, "auxiliary_loss_mlp": 0.01048043, "balance_loss_clip": 1.05367923, "balance_loss_mlp": 1.03001821, "epoch": 0.2855253269201864, "flos": 13251540205440.0, "grad_norm": 2.39914017508816, "language_loss": 0.8061412, "learning_rate": 3.3532807566133592e-06, "loss": 0.82810223, "num_input_tokens_seen": 102450990, "step": 4749, "time_per_iteration": 4.199607610702515 }, { "auxiliary_loss_clip": 0.01135936, "auxiliary_loss_mlp": 0.01045252, "balance_loss_clip": 1.05160487, "balance_loss_mlp": 1.02788317, "epoch": 0.28558545017285436, "flos": 28620876337920.0, "grad_norm": 1.92101956988616, "language_loss": 0.70763719, "learning_rate": 3.3529939650027587e-06, "loss": 0.72944903, "num_input_tokens_seen": 102471820, "step": 4750, "time_per_iteration": 2.6975722312927246 }, { "auxiliary_loss_clip": 0.01132057, "auxiliary_loss_mlp": 0.0104367, "balance_loss_clip": 1.05308008, "balance_loss_mlp": 1.02660573, "epoch": 0.2856455734255223, "flos": 34130470752000.0, "grad_norm": 1.619747991653998, "language_loss": 0.81983078, "learning_rate": 3.3527071220864917e-06, "loss": 0.84158808, "num_input_tokens_seen": 102492625, "step": 4751, "time_per_iteration": 2.685194969177246 }, { "auxiliary_loss_clip": 0.01146027, "auxiliary_loss_mlp": 0.01046872, "balance_loss_clip": 1.0541997, "balance_loss_mlp": 1.03009951, "epoch": 0.2857056966781903, "flos": 39786149779200.0, "grad_norm": 2.1857777553010203, "language_loss": 0.80359828, "learning_rate": 3.3524202278754353e-06, "loss": 0.82552731, "num_input_tokens_seen": 102514145, "step": 4752, "time_per_iteration": 4.363154649734497 }, { "auxiliary_loss_clip": 0.01130862, "auxiliary_loss_mlp": 0.010456, "balance_loss_clip": 1.04920304, "balance_loss_mlp": 1.02675319, "epoch": 0.28576581993085826, "flos": 21872292145920.0, "grad_norm": 2.612706759191024, "language_loss": 0.78674287, "learning_rate": 3.3521332823804676e-06, "loss": 0.8085075, "num_input_tokens_seen": 102532365, "step": 4753, "time_per_iteration": 2.6128499507904053 }, { "auxiliary_loss_clip": 0.0114991, "auxiliary_loss_mlp": 0.01051658, "balance_loss_clip": 1.05356765, "balance_loss_mlp": 1.03166628, "epoch": 0.2858259431835262, "flos": 19091656592640.0, "grad_norm": 3.5161743537336596, "language_loss": 0.8947711, "learning_rate": 3.3518462856124704e-06, "loss": 0.91678679, "num_input_tokens_seen": 102548425, "step": 4754, "time_per_iteration": 2.5410687923431396 }, { "auxiliary_loss_clip": 0.01130155, "auxiliary_loss_mlp": 0.010468, "balance_loss_clip": 1.05048347, "balance_loss_mlp": 1.03026593, "epoch": 0.2858860664361942, "flos": 20334309557760.0, "grad_norm": 2.3617926288322724, "language_loss": 0.82039523, "learning_rate": 3.3515592375823267e-06, "loss": 0.84216481, "num_input_tokens_seen": 102566370, "step": 4755, "time_per_iteration": 2.6514527797698975 }, { "auxiliary_loss_clip": 0.01098878, "auxiliary_loss_mlp": 0.01049575, "balance_loss_clip": 1.04732597, "balance_loss_mlp": 1.03233767, "epoch": 0.28594618968886215, "flos": 24461738582400.0, "grad_norm": 1.6385978416895255, "language_loss": 0.83764589, "learning_rate": 3.351272138300922e-06, "loss": 0.8591305, "num_input_tokens_seen": 102588715, "step": 4756, "time_per_iteration": 2.7975916862487793 }, { "auxiliary_loss_clip": 0.01023363, "auxiliary_loss_mlp": 0.01007772, "balance_loss_clip": 1.01913142, "balance_loss_mlp": 1.00524473, "epoch": 0.2860063129415301, "flos": 71652850709760.0, "grad_norm": 0.8721113874523594, "language_loss": 0.6097033, "learning_rate": 3.350984987779142e-06, "loss": 0.63001466, "num_input_tokens_seen": 102656715, "step": 4757, "time_per_iteration": 3.406625986099243 }, { "auxiliary_loss_clip": 0.01147819, "auxiliary_loss_mlp": 0.01038916, "balance_loss_clip": 1.05585599, "balance_loss_mlp": 1.021595, "epoch": 0.2860664361941981, "flos": 20558679863040.0, "grad_norm": 2.030913944398288, "language_loss": 0.66206789, "learning_rate": 3.3506977860278756e-06, "loss": 0.68393528, "num_input_tokens_seen": 102676545, "step": 4758, "time_per_iteration": 2.589768648147583 }, { "auxiliary_loss_clip": 0.01133475, "auxiliary_loss_mlp": 0.01042694, "balance_loss_clip": 1.04988813, "balance_loss_mlp": 1.02581418, "epoch": 0.2861265594468661, "flos": 35996389534080.0, "grad_norm": 2.019963236438103, "language_loss": 0.63374877, "learning_rate": 3.3504105330580143e-06, "loss": 0.65551043, "num_input_tokens_seen": 102702875, "step": 4759, "time_per_iteration": 2.809325695037842 }, { "auxiliary_loss_clip": 0.01129183, "auxiliary_loss_mlp": 0.00777076, "balance_loss_clip": 1.04924989, "balance_loss_mlp": 1.00088644, "epoch": 0.28618668269953407, "flos": 20047419630720.0, "grad_norm": 1.9693348774443893, "language_loss": 0.74033993, "learning_rate": 3.3501232288804496e-06, "loss": 0.75940251, "num_input_tokens_seen": 102723160, "step": 4760, "time_per_iteration": 2.6797397136688232 }, { "auxiliary_loss_clip": 0.01124387, "auxiliary_loss_mlp": 0.01045022, "balance_loss_clip": 1.05517232, "balance_loss_mlp": 1.02849925, "epoch": 0.28624680595220203, "flos": 24971849579520.0, "grad_norm": 2.574168946313644, "language_loss": 0.72227889, "learning_rate": 3.3498358735060773e-06, "loss": 0.74397296, "num_input_tokens_seen": 102743855, "step": 4761, "time_per_iteration": 2.672394275665283 }, { "auxiliary_loss_clip": 0.01079005, "auxiliary_loss_mlp": 0.01049385, "balance_loss_clip": 1.04688287, "balance_loss_mlp": 1.03218305, "epoch": 0.28630692920487, "flos": 22492253911680.0, "grad_norm": 2.095293128310336, "language_loss": 0.74758703, "learning_rate": 3.349548466945793e-06, "loss": 0.76887095, "num_input_tokens_seen": 102761370, "step": 4762, "time_per_iteration": 2.8573946952819824 }, { "auxiliary_loss_clip": 0.01108257, "auxiliary_loss_mlp": 0.01044255, "balance_loss_clip": 1.05117726, "balance_loss_mlp": 1.02725577, "epoch": 0.28636705245753796, "flos": 21249888255360.0, "grad_norm": 1.4714690500952254, "language_loss": 0.76185489, "learning_rate": 3.349261009210496e-06, "loss": 0.78338003, "num_input_tokens_seen": 102780885, "step": 4763, "time_per_iteration": 2.7058494091033936 }, { "auxiliary_loss_clip": 0.01103052, "auxiliary_loss_mlp": 0.01041715, "balance_loss_clip": 1.0442332, "balance_loss_mlp": 1.0234046, "epoch": 0.28642717571020593, "flos": 24095772864000.0, "grad_norm": 2.250941696220621, "language_loss": 0.77264833, "learning_rate": 3.348973500311086e-06, "loss": 0.79409599, "num_input_tokens_seen": 102801000, "step": 4764, "time_per_iteration": 2.7363107204437256 }, { "auxiliary_loss_clip": 0.0111141, "auxiliary_loss_mlp": 0.01044325, "balance_loss_clip": 1.04883742, "balance_loss_mlp": 1.02520347, "epoch": 0.2864872989628739, "flos": 22601386408320.0, "grad_norm": 3.808468667851145, "language_loss": 0.71222258, "learning_rate": 3.348685940258466e-06, "loss": 0.73377991, "num_input_tokens_seen": 102820230, "step": 4765, "time_per_iteration": 2.7225682735443115 }, { "auxiliary_loss_clip": 0.01127531, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.0501802, "balance_loss_mlp": 1.02118707, "epoch": 0.28654742221554186, "flos": 32745073138560.0, "grad_norm": 1.6284115173108313, "language_loss": 0.76206756, "learning_rate": 3.3483983290635395e-06, "loss": 0.78371924, "num_input_tokens_seen": 102842670, "step": 4766, "time_per_iteration": 2.724776268005371 }, { "auxiliary_loss_clip": 0.01130255, "auxiliary_loss_mlp": 0.01038205, "balance_loss_clip": 1.0502758, "balance_loss_mlp": 1.02133691, "epoch": 0.2866075454682098, "flos": 26981626331520.0, "grad_norm": 1.7313176116986193, "language_loss": 0.77457404, "learning_rate": 3.348110666737214e-06, "loss": 0.79625863, "num_input_tokens_seen": 102864480, "step": 4767, "time_per_iteration": 2.7313742637634277 }, { "auxiliary_loss_clip": 0.0114162, "auxiliary_loss_mlp": 0.01042697, "balance_loss_clip": 1.05109096, "balance_loss_mlp": 1.02519727, "epoch": 0.2866676687208778, "flos": 23253847004160.0, "grad_norm": 1.7818476838857593, "language_loss": 0.65043855, "learning_rate": 3.3478229532903956e-06, "loss": 0.67228168, "num_input_tokens_seen": 102883740, "step": 4768, "time_per_iteration": 2.6173784732818604 }, { "auxiliary_loss_clip": 0.01123197, "auxiliary_loss_mlp": 0.01041331, "balance_loss_clip": 1.04803848, "balance_loss_mlp": 1.02385533, "epoch": 0.28672779197354575, "flos": 21579727870080.0, "grad_norm": 1.5842392137882455, "language_loss": 0.70497799, "learning_rate": 3.3475351887339967e-06, "loss": 0.7266233, "num_input_tokens_seen": 102902945, "step": 4769, "time_per_iteration": 2.627859115600586 }, { "auxiliary_loss_clip": 0.01078118, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.04276228, "balance_loss_mlp": 1.01722169, "epoch": 0.2867879152262137, "flos": 19865568049920.0, "grad_norm": 1.555057890983365, "language_loss": 0.74735439, "learning_rate": 3.3472473730789288e-06, "loss": 0.76847351, "num_input_tokens_seen": 102922405, "step": 4770, "time_per_iteration": 2.807286262512207 }, { "auxiliary_loss_clip": 0.01094623, "auxiliary_loss_mlp": 0.01041164, "balance_loss_clip": 1.04522562, "balance_loss_mlp": 1.02336657, "epoch": 0.2868480384788817, "flos": 28213325648640.0, "grad_norm": 2.2768786529491427, "language_loss": 0.6760053, "learning_rate": 3.3469595063361045e-06, "loss": 0.6973632, "num_input_tokens_seen": 102938980, "step": 4771, "time_per_iteration": 2.7709410190582275 }, { "auxiliary_loss_clip": 0.01041422, "auxiliary_loss_mlp": 0.01015109, "balance_loss_clip": 1.01907253, "balance_loss_mlp": 1.01243877, "epoch": 0.2869081617315497, "flos": 65424286690560.0, "grad_norm": 0.770068198596698, "language_loss": 0.56874299, "learning_rate": 3.3466715885164414e-06, "loss": 0.58930826, "num_input_tokens_seen": 103000405, "step": 4772, "time_per_iteration": 3.0978245735168457 }, { "auxiliary_loss_clip": 0.01067739, "auxiliary_loss_mlp": 0.0077878, "balance_loss_clip": 1.04115915, "balance_loss_mlp": 1.00089169, "epoch": 0.28696828498421767, "flos": 18660729127680.0, "grad_norm": 2.7874039039613345, "language_loss": 0.82870376, "learning_rate": 3.346383619630856e-06, "loss": 0.84716898, "num_input_tokens_seen": 103017970, "step": 4773, "time_per_iteration": 2.7716143131256104 }, { "auxiliary_loss_clip": 0.0114188, "auxiliary_loss_mlp": 0.01043405, "balance_loss_clip": 1.04776216, "balance_loss_mlp": 1.02553546, "epoch": 0.28702840823688563, "flos": 23659745667840.0, "grad_norm": 11.069053071667042, "language_loss": 0.77580261, "learning_rate": 3.34609559969027e-06, "loss": 0.79765546, "num_input_tokens_seen": 103036385, "step": 4774, "time_per_iteration": 2.604790687561035 }, { "auxiliary_loss_clip": 0.01119567, "auxiliary_loss_mlp": 0.01042061, "balance_loss_clip": 1.04915977, "balance_loss_mlp": 1.02414346, "epoch": 0.2870885314895536, "flos": 13804744544640.0, "grad_norm": 1.9103573283121942, "language_loss": 0.73611873, "learning_rate": 3.3458075287056034e-06, "loss": 0.75773501, "num_input_tokens_seen": 103052170, "step": 4775, "time_per_iteration": 2.6234211921691895 }, { "auxiliary_loss_clip": 0.01133151, "auxiliary_loss_mlp": 0.01045326, "balance_loss_clip": 1.04905081, "balance_loss_mlp": 1.02782607, "epoch": 0.28714865474222157, "flos": 17786771314560.0, "grad_norm": 1.6535491049734306, "language_loss": 0.88343942, "learning_rate": 3.34551940668778e-06, "loss": 0.9052242, "num_input_tokens_seen": 103070510, "step": 4776, "time_per_iteration": 2.6941640377044678 }, { "auxiliary_loss_clip": 0.01132773, "auxiliary_loss_mlp": 0.0104327, "balance_loss_clip": 1.05156159, "balance_loss_mlp": 1.02712941, "epoch": 0.28720877799488953, "flos": 15997486199040.0, "grad_norm": 1.7321020140737395, "language_loss": 0.74257779, "learning_rate": 3.345231233647726e-06, "loss": 0.76433825, "num_input_tokens_seen": 103089590, "step": 4777, "time_per_iteration": 2.645650863647461 }, { "auxiliary_loss_clip": 0.01126691, "auxiliary_loss_mlp": 0.01045293, "balance_loss_clip": 1.05245948, "balance_loss_mlp": 1.02812648, "epoch": 0.2872689012475575, "flos": 20923137210240.0, "grad_norm": 1.9446580110028222, "language_loss": 0.80069196, "learning_rate": 3.3449430095963696e-06, "loss": 0.82241178, "num_input_tokens_seen": 103109080, "step": 4778, "time_per_iteration": 2.7606308460235596 }, { "auxiliary_loss_clip": 0.01123482, "auxiliary_loss_mlp": 0.01044505, "balance_loss_clip": 1.05461526, "balance_loss_mlp": 1.02750611, "epoch": 0.28732902450022546, "flos": 21325121291520.0, "grad_norm": 1.7560492266469991, "language_loss": 0.7396307, "learning_rate": 3.3446547345446386e-06, "loss": 0.76131058, "num_input_tokens_seen": 103127755, "step": 4779, "time_per_iteration": 2.831167221069336 }, { "auxiliary_loss_clip": 0.01122102, "auxiliary_loss_mlp": 0.01043876, "balance_loss_clip": 1.04866719, "balance_loss_mlp": 1.0262928, "epoch": 0.2873891477528934, "flos": 20850382212480.0, "grad_norm": 1.5882306223862566, "language_loss": 0.76327771, "learning_rate": 3.3443664085034656e-06, "loss": 0.7849375, "num_input_tokens_seen": 103147035, "step": 4780, "time_per_iteration": 2.6548538208007812 }, { "auxiliary_loss_clip": 0.01102465, "auxiliary_loss_mlp": 0.01042038, "balance_loss_clip": 1.04413557, "balance_loss_mlp": 1.02517641, "epoch": 0.2874492710055614, "flos": 17420051410560.0, "grad_norm": 1.5896497572299877, "language_loss": 0.81445092, "learning_rate": 3.344078031483784e-06, "loss": 0.83589596, "num_input_tokens_seen": 103165410, "step": 4781, "time_per_iteration": 2.6422417163848877 }, { "auxiliary_loss_clip": 0.01109573, "auxiliary_loss_mlp": 0.01045358, "balance_loss_clip": 1.05339658, "balance_loss_mlp": 1.0277034, "epoch": 0.28750939425822936, "flos": 13406818700160.0, "grad_norm": 1.8389370421072637, "language_loss": 0.86738765, "learning_rate": 3.3437896034965283e-06, "loss": 0.888937, "num_input_tokens_seen": 103183710, "step": 4782, "time_per_iteration": 2.7507951259613037 }, { "auxiliary_loss_clip": 0.01113582, "auxiliary_loss_mlp": 0.01043351, "balance_loss_clip": 1.05343366, "balance_loss_mlp": 1.02604771, "epoch": 0.2875695175108973, "flos": 21870029589120.0, "grad_norm": 1.5283433651606986, "language_loss": 0.71153063, "learning_rate": 3.3435011245526357e-06, "loss": 0.73309994, "num_input_tokens_seen": 103203790, "step": 4783, "time_per_iteration": 2.7166218757629395 }, { "auxiliary_loss_clip": 0.0112343, "auxiliary_loss_mlp": 0.01047879, "balance_loss_clip": 1.05475473, "balance_loss_mlp": 1.030761, "epoch": 0.2876296407635653, "flos": 26245457089920.0, "grad_norm": 1.6861942701171202, "language_loss": 0.76872855, "learning_rate": 3.343212594663047e-06, "loss": 0.79044163, "num_input_tokens_seen": 103223925, "step": 4784, "time_per_iteration": 2.693665027618408 }, { "auxiliary_loss_clip": 0.01095423, "auxiliary_loss_mlp": 0.01053931, "balance_loss_clip": 1.04587293, "balance_loss_mlp": 1.03514349, "epoch": 0.28768976401623325, "flos": 25373654092800.0, "grad_norm": 4.596098798847224, "language_loss": 0.75646108, "learning_rate": 3.3429240138387015e-06, "loss": 0.77795458, "num_input_tokens_seen": 103244760, "step": 4785, "time_per_iteration": 4.380687236785889 }, { "auxiliary_loss_clip": 0.01144615, "auxiliary_loss_mlp": 0.01048905, "balance_loss_clip": 1.0532378, "balance_loss_mlp": 1.03213263, "epoch": 0.28774988726890127, "flos": 30664372982400.0, "grad_norm": 2.434913324661012, "language_loss": 0.83660555, "learning_rate": 3.3426353820905425e-06, "loss": 0.85854077, "num_input_tokens_seen": 103261995, "step": 4786, "time_per_iteration": 4.138700723648071 }, { "auxiliary_loss_clip": 0.01113505, "auxiliary_loss_mlp": 0.0077478, "balance_loss_clip": 1.05201936, "balance_loss_mlp": 1.00095487, "epoch": 0.28781001052156924, "flos": 20595452411520.0, "grad_norm": 1.8737605513707083, "language_loss": 0.80388975, "learning_rate": 3.342346699429516e-06, "loss": 0.82277262, "num_input_tokens_seen": 103279780, "step": 4787, "time_per_iteration": 2.7030651569366455 }, { "auxiliary_loss_clip": 0.01120528, "auxiliary_loss_mlp": 0.01039353, "balance_loss_clip": 1.0489651, "balance_loss_mlp": 1.02212751, "epoch": 0.2878701337742372, "flos": 26542330997760.0, "grad_norm": 1.8370986188087255, "language_loss": 0.83052301, "learning_rate": 3.3420579658665677e-06, "loss": 0.85212183, "num_input_tokens_seen": 103300580, "step": 4788, "time_per_iteration": 2.7650442123413086 }, { "auxiliary_loss_clip": 0.01110861, "auxiliary_loss_mlp": 0.01044904, "balance_loss_clip": 1.0567044, "balance_loss_mlp": 1.0279882, "epoch": 0.28793025702690517, "flos": 28146855530880.0, "grad_norm": 7.859878454786593, "language_loss": 0.73045379, "learning_rate": 3.3417691814126468e-06, "loss": 0.75201148, "num_input_tokens_seen": 103320430, "step": 4789, "time_per_iteration": 4.340694189071655 }, { "auxiliary_loss_clip": 0.01123471, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.04852343, "balance_loss_mlp": 1.02599669, "epoch": 0.28799038027957313, "flos": 23805471144960.0, "grad_norm": 1.7615007973154742, "language_loss": 0.84425223, "learning_rate": 3.341480346078704e-06, "loss": 0.86591256, "num_input_tokens_seen": 103337695, "step": 4790, "time_per_iteration": 2.6953821182250977 }, { "auxiliary_loss_clip": 0.01136004, "auxiliary_loss_mlp": 0.01049022, "balance_loss_clip": 1.05240703, "balance_loss_mlp": 1.03145027, "epoch": 0.2880505035322411, "flos": 22344122223360.0, "grad_norm": 1.743209341690147, "language_loss": 0.78031182, "learning_rate": 3.3411914598756922e-06, "loss": 0.80216199, "num_input_tokens_seen": 103357010, "step": 4791, "time_per_iteration": 4.299259424209595 }, { "auxiliary_loss_clip": 0.01120123, "auxiliary_loss_mlp": 0.01036962, "balance_loss_clip": 1.05015528, "balance_loss_mlp": 1.01999843, "epoch": 0.28811062678490906, "flos": 18004246208640.0, "grad_norm": 2.2148694233914474, "language_loss": 0.70164073, "learning_rate": 3.3409025228145654e-06, "loss": 0.72321159, "num_input_tokens_seen": 103375600, "step": 4792, "time_per_iteration": 2.646732807159424 }, { "auxiliary_loss_clip": 0.01107079, "auxiliary_loss_mlp": 0.01037734, "balance_loss_clip": 1.05645919, "balance_loss_mlp": 1.02149773, "epoch": 0.28817075003757703, "flos": 22090880361600.0, "grad_norm": 1.9192442052106609, "language_loss": 0.79200894, "learning_rate": 3.3406135349062812e-06, "loss": 0.81345713, "num_input_tokens_seen": 103395225, "step": 4793, "time_per_iteration": 2.765010356903076 }, { "auxiliary_loss_clip": 0.01117839, "auxiliary_loss_mlp": 0.01038019, "balance_loss_clip": 1.05114603, "balance_loss_mlp": 1.02235532, "epoch": 0.288230873290245, "flos": 41683130847360.0, "grad_norm": 1.7689864288971164, "language_loss": 0.78136635, "learning_rate": 3.340324496161797e-06, "loss": 0.80292487, "num_input_tokens_seen": 103417245, "step": 4794, "time_per_iteration": 2.868473529815674 }, { "auxiliary_loss_clip": 0.01134193, "auxiliary_loss_mlp": 0.0104583, "balance_loss_clip": 1.05259347, "balance_loss_mlp": 1.02856886, "epoch": 0.28829099654291296, "flos": 18624423456000.0, "grad_norm": 2.1692523829597063, "language_loss": 0.8320052, "learning_rate": 3.340035406592074e-06, "loss": 0.85380542, "num_input_tokens_seen": 103435500, "step": 4795, "time_per_iteration": 2.6216471195220947 }, { "auxiliary_loss_clip": 0.01126764, "auxiliary_loss_mlp": 0.01043565, "balance_loss_clip": 1.05043364, "balance_loss_mlp": 1.0279845, "epoch": 0.2883511197955809, "flos": 24674832017280.0, "grad_norm": 2.290853867887048, "language_loss": 0.74744678, "learning_rate": 3.339746266208074e-06, "loss": 0.76915002, "num_input_tokens_seen": 103451040, "step": 4796, "time_per_iteration": 2.6819822788238525 }, { "auxiliary_loss_clip": 0.01136938, "auxiliary_loss_mlp": 0.01040822, "balance_loss_clip": 1.05140758, "balance_loss_mlp": 1.02221298, "epoch": 0.2884112430482489, "flos": 23112143850240.0, "grad_norm": 1.9890524806298786, "language_loss": 0.73144913, "learning_rate": 3.3394570750207614e-06, "loss": 0.7532267, "num_input_tokens_seen": 103471330, "step": 4797, "time_per_iteration": 2.666097640991211 }, { "auxiliary_loss_clip": 0.01104454, "auxiliary_loss_mlp": 0.00775335, "balance_loss_clip": 1.04594803, "balance_loss_mlp": 1.00097072, "epoch": 0.28847136630091685, "flos": 16873347432960.0, "grad_norm": 1.9324008515617646, "language_loss": 0.74650872, "learning_rate": 3.3391678330411017e-06, "loss": 0.76530659, "num_input_tokens_seen": 103488060, "step": 4798, "time_per_iteration": 2.7281830310821533 }, { "auxiliary_loss_clip": 0.0113412, "auxiliary_loss_mlp": 0.01043523, "balance_loss_clip": 1.04996431, "balance_loss_mlp": 1.02463984, "epoch": 0.2885314895535849, "flos": 25657527277440.0, "grad_norm": 3.037553219769834, "language_loss": 0.66004431, "learning_rate": 3.3388785402800642e-06, "loss": 0.68182075, "num_input_tokens_seen": 103503600, "step": 4799, "time_per_iteration": 2.6416096687316895 }, { "auxiliary_loss_clip": 0.01144575, "auxiliary_loss_mlp": 0.01049843, "balance_loss_clip": 1.05205584, "balance_loss_mlp": 1.03268862, "epoch": 0.28859161280625284, "flos": 21107251347840.0, "grad_norm": 1.7946911133370596, "language_loss": 0.8231616, "learning_rate": 3.3385891967486178e-06, "loss": 0.84510577, "num_input_tokens_seen": 103524195, "step": 4800, "time_per_iteration": 2.704357624053955 }, { "auxiliary_loss_clip": 0.01105166, "auxiliary_loss_mlp": 0.01040519, "balance_loss_clip": 1.04861474, "balance_loss_mlp": 1.02392507, "epoch": 0.2886517360589208, "flos": 26469540086400.0, "grad_norm": 1.5930665564066124, "language_loss": 0.9080106, "learning_rate": 3.3382998024577347e-06, "loss": 0.92946744, "num_input_tokens_seen": 103545235, "step": 4801, "time_per_iteration": 2.8163902759552 }, { "auxiliary_loss_clip": 0.01119221, "auxiliary_loss_mlp": 0.00775037, "balance_loss_clip": 1.05178905, "balance_loss_mlp": 1.0008862, "epoch": 0.28871185931158877, "flos": 25265275781760.0, "grad_norm": 2.098995863955026, "language_loss": 0.74342406, "learning_rate": 3.33801035741839e-06, "loss": 0.76236671, "num_input_tokens_seen": 103563305, "step": 4802, "time_per_iteration": 2.8244271278381348 }, { "auxiliary_loss_clip": 0.01029511, "auxiliary_loss_mlp": 0.01004263, "balance_loss_clip": 1.02472734, "balance_loss_mlp": 1.00193822, "epoch": 0.28877198256425674, "flos": 66665431284480.0, "grad_norm": 0.7780596068321518, "language_loss": 0.62987334, "learning_rate": 3.337720861641558e-06, "loss": 0.65021104, "num_input_tokens_seen": 103625025, "step": 4803, "time_per_iteration": 3.299269676208496 }, { "auxiliary_loss_clip": 0.01083739, "auxiliary_loss_mlp": 0.01051002, "balance_loss_clip": 1.03981495, "balance_loss_mlp": 1.03369915, "epoch": 0.2888321058169247, "flos": 20303031790080.0, "grad_norm": 1.8528386679599225, "language_loss": 0.71095157, "learning_rate": 3.3374313151382165e-06, "loss": 0.73229897, "num_input_tokens_seen": 103644235, "step": 4804, "time_per_iteration": 2.762883424758911 }, { "auxiliary_loss_clip": 0.01135071, "auxiliary_loss_mlp": 0.01047534, "balance_loss_clip": 1.05108273, "balance_loss_mlp": 1.0289135, "epoch": 0.28889222906959267, "flos": 25516721963520.0, "grad_norm": 1.926588918304246, "language_loss": 0.67916834, "learning_rate": 3.337141717919346e-06, "loss": 0.70099443, "num_input_tokens_seen": 103664700, "step": 4805, "time_per_iteration": 2.6848111152648926 }, { "auxiliary_loss_clip": 0.01135111, "auxiliary_loss_mlp": 0.01046638, "balance_loss_clip": 1.05359602, "balance_loss_mlp": 1.03029394, "epoch": 0.28895235232226063, "flos": 32671312560000.0, "grad_norm": 1.4381182508216341, "language_loss": 0.69720542, "learning_rate": 3.3368520699959272e-06, "loss": 0.71902293, "num_input_tokens_seen": 103686595, "step": 4806, "time_per_iteration": 2.762458562850952 }, { "auxiliary_loss_clip": 0.01120642, "auxiliary_loss_mlp": 0.01052311, "balance_loss_clip": 1.05073118, "balance_loss_mlp": 1.03559768, "epoch": 0.2890124755749286, "flos": 29714679342720.0, "grad_norm": 1.4600495853323927, "language_loss": 0.71255589, "learning_rate": 3.3365623713789443e-06, "loss": 0.73428547, "num_input_tokens_seen": 103707525, "step": 4807, "time_per_iteration": 2.740931987762451 }, { "auxiliary_loss_clip": 0.01106054, "auxiliary_loss_mlp": 0.01043407, "balance_loss_clip": 1.05087459, "balance_loss_mlp": 1.02625299, "epoch": 0.28907259882759656, "flos": 22674464628480.0, "grad_norm": 1.6111027163793539, "language_loss": 0.81489629, "learning_rate": 3.336272622079382e-06, "loss": 0.83639085, "num_input_tokens_seen": 103727905, "step": 4808, "time_per_iteration": 2.722787380218506 }, { "auxiliary_loss_clip": 0.01098162, "auxiliary_loss_mlp": 0.01048507, "balance_loss_clip": 1.04795146, "balance_loss_mlp": 1.03160298, "epoch": 0.2891327220802645, "flos": 22566050403840.0, "grad_norm": 1.7874609682529725, "language_loss": 0.78304112, "learning_rate": 3.3359828221082276e-06, "loss": 0.80450785, "num_input_tokens_seen": 103748335, "step": 4809, "time_per_iteration": 2.742063522338867 }, { "auxiliary_loss_clip": 0.01091743, "auxiliary_loss_mlp": 0.01047553, "balance_loss_clip": 1.04519784, "balance_loss_mlp": 1.02924204, "epoch": 0.2891928453329325, "flos": 21652806090240.0, "grad_norm": 1.7709564567634208, "language_loss": 0.78864932, "learning_rate": 3.3356929714764714e-06, "loss": 0.81004226, "num_input_tokens_seen": 103767020, "step": 4810, "time_per_iteration": 2.7578415870666504 }, { "auxiliary_loss_clip": 0.01090252, "auxiliary_loss_mlp": 0.01039009, "balance_loss_clip": 1.04552603, "balance_loss_mlp": 1.02280235, "epoch": 0.28925296858560046, "flos": 23222102359680.0, "grad_norm": 1.6298276151024105, "language_loss": 0.76974982, "learning_rate": 3.3354030701951032e-06, "loss": 0.79104245, "num_input_tokens_seen": 103786355, "step": 4811, "time_per_iteration": 2.7336831092834473 }, { "auxiliary_loss_clip": 0.01132677, "auxiliary_loss_mlp": 0.01047674, "balance_loss_clip": 1.05356216, "balance_loss_mlp": 1.03038859, "epoch": 0.2893130918382685, "flos": 28621666437120.0, "grad_norm": 1.4740946425962824, "language_loss": 0.77044773, "learning_rate": 3.335113118275117e-06, "loss": 0.79225123, "num_input_tokens_seen": 103809345, "step": 4812, "time_per_iteration": 2.745115280151367 }, { "auxiliary_loss_clip": 0.01024348, "auxiliary_loss_mlp": 0.01009076, "balance_loss_clip": 1.02794337, "balance_loss_mlp": 1.00728762, "epoch": 0.28937321509093644, "flos": 72301288982400.0, "grad_norm": 0.8337141037006477, "language_loss": 0.60292435, "learning_rate": 3.3348231157275085e-06, "loss": 0.62325859, "num_input_tokens_seen": 103871180, "step": 4813, "time_per_iteration": 3.3592262268066406 }, { "auxiliary_loss_clip": 0.01094544, "auxiliary_loss_mlp": 0.01044805, "balance_loss_clip": 1.0431211, "balance_loss_mlp": 1.02734065, "epoch": 0.2894333383436044, "flos": 16216397637120.0, "grad_norm": 3.1340543474440623, "language_loss": 0.82301223, "learning_rate": 3.3345330625632725e-06, "loss": 0.84440577, "num_input_tokens_seen": 103889040, "step": 4814, "time_per_iteration": 2.7069244384765625 }, { "auxiliary_loss_clip": 0.01101478, "auxiliary_loss_mlp": 0.01052591, "balance_loss_clip": 1.05051374, "balance_loss_mlp": 1.03556752, "epoch": 0.2894934615962724, "flos": 24828278918400.0, "grad_norm": 1.6672038490985601, "language_loss": 0.73249441, "learning_rate": 3.3342429587934094e-06, "loss": 0.75403512, "num_input_tokens_seen": 103910380, "step": 4815, "time_per_iteration": 2.764214515686035 }, { "auxiliary_loss_clip": 0.01131126, "auxiliary_loss_mlp": 0.01045124, "balance_loss_clip": 1.05259883, "balance_loss_mlp": 1.02997231, "epoch": 0.28955358484894034, "flos": 20449978329600.0, "grad_norm": 1.9821106518618066, "language_loss": 0.70783043, "learning_rate": 3.3339528044289198e-06, "loss": 0.72959292, "num_input_tokens_seen": 103929955, "step": 4816, "time_per_iteration": 2.7809629440307617 }, { "auxiliary_loss_clip": 0.01119261, "auxiliary_loss_mlp": 0.01048806, "balance_loss_clip": 1.04862189, "balance_loss_mlp": 1.03097248, "epoch": 0.2896137081016083, "flos": 22565188477440.0, "grad_norm": 2.3636227133284122, "language_loss": 0.7445122, "learning_rate": 3.3336625994808055e-06, "loss": 0.76619279, "num_input_tokens_seen": 103948020, "step": 4817, "time_per_iteration": 2.829183578491211 }, { "auxiliary_loss_clip": 0.01108198, "auxiliary_loss_mlp": 0.01054129, "balance_loss_clip": 1.05107522, "balance_loss_mlp": 1.03633142, "epoch": 0.28967383135427627, "flos": 26687948734080.0, "grad_norm": 1.8479613371686012, "language_loss": 0.76190692, "learning_rate": 3.3333723439600723e-06, "loss": 0.78353024, "num_input_tokens_seen": 103968740, "step": 4818, "time_per_iteration": 2.827925443649292 }, { "auxiliary_loss_clip": 0.01074516, "auxiliary_loss_mlp": 0.01041914, "balance_loss_clip": 1.04805899, "balance_loss_mlp": 1.02477193, "epoch": 0.28973395460694423, "flos": 15558262692480.0, "grad_norm": 1.9558897556763024, "language_loss": 0.80060315, "learning_rate": 3.3330820378777263e-06, "loss": 0.82176751, "num_input_tokens_seen": 103986005, "step": 4819, "time_per_iteration": 2.8941574096679688 }, { "auxiliary_loss_clip": 0.01110223, "auxiliary_loss_mlp": 0.01048219, "balance_loss_clip": 1.0494163, "balance_loss_mlp": 1.02931273, "epoch": 0.2897940778596122, "flos": 18697465762560.0, "grad_norm": 1.8074124972104149, "language_loss": 0.78504574, "learning_rate": 3.332791681244776e-06, "loss": 0.80663019, "num_input_tokens_seen": 104005070, "step": 4820, "time_per_iteration": 2.7016515731811523 }, { "auxiliary_loss_clip": 0.01096478, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.04924846, "balance_loss_mlp": 1.02028775, "epoch": 0.28985420111228016, "flos": 18770292587520.0, "grad_norm": 2.105369007151224, "language_loss": 0.72925651, "learning_rate": 3.332501274072231e-06, "loss": 0.7505917, "num_input_tokens_seen": 104022945, "step": 4821, "time_per_iteration": 2.743091583251953 }, { "auxiliary_loss_clip": 0.01132782, "auxiliary_loss_mlp": 0.01040556, "balance_loss_clip": 1.05055594, "balance_loss_mlp": 1.02290142, "epoch": 0.28991432436494813, "flos": 23069840607360.0, "grad_norm": 2.331696646407205, "language_loss": 0.71962738, "learning_rate": 3.332210816371104e-06, "loss": 0.74136078, "num_input_tokens_seen": 104042080, "step": 4822, "time_per_iteration": 2.768996477127075 }, { "auxiliary_loss_clip": 0.01128837, "auxiliary_loss_mlp": 0.01048176, "balance_loss_clip": 1.05237818, "balance_loss_mlp": 1.03142738, "epoch": 0.2899744476176161, "flos": 17603195880960.0, "grad_norm": 1.8111020118629353, "language_loss": 0.662521, "learning_rate": 3.3319203081524102e-06, "loss": 0.68429112, "num_input_tokens_seen": 104060975, "step": 4823, "time_per_iteration": 2.733591318130493 }, { "auxiliary_loss_clip": 0.01107872, "auxiliary_loss_mlp": 0.01042255, "balance_loss_clip": 1.04404497, "balance_loss_mlp": 1.02588761, "epoch": 0.29003457087028406, "flos": 22309360836480.0, "grad_norm": 4.579803152663717, "language_loss": 0.81162238, "learning_rate": 3.331629749427164e-06, "loss": 0.83312368, "num_input_tokens_seen": 104081395, "step": 4824, "time_per_iteration": 4.278540849685669 }, { "auxiliary_loss_clip": 0.01143667, "auxiliary_loss_mlp": 0.01043888, "balance_loss_clip": 1.05104661, "balance_loss_mlp": 1.025828, "epoch": 0.2900946941229521, "flos": 21944975316480.0, "grad_norm": 2.265114761106369, "language_loss": 0.72592747, "learning_rate": 3.331339140206385e-06, "loss": 0.74780297, "num_input_tokens_seen": 104099995, "step": 4825, "time_per_iteration": 4.177908658981323 }, { "auxiliary_loss_clip": 0.01147795, "auxiliary_loss_mlp": 0.01036998, "balance_loss_clip": 1.05434549, "balance_loss_mlp": 1.01930714, "epoch": 0.29015481737562004, "flos": 17932173569280.0, "grad_norm": 2.216571865047856, "language_loss": 0.73680669, "learning_rate": 3.331048480501092e-06, "loss": 0.75865459, "num_input_tokens_seen": 104118930, "step": 4826, "time_per_iteration": 2.6371700763702393 }, { "auxiliary_loss_clip": 0.0113072, "auxiliary_loss_mlp": 0.01040585, "balance_loss_clip": 1.05073726, "balance_loss_mlp": 1.02483773, "epoch": 0.290214940628288, "flos": 22783525297920.0, "grad_norm": 2.324527624383577, "language_loss": 0.68556225, "learning_rate": 3.3307577703223073e-06, "loss": 0.70727527, "num_input_tokens_seen": 104136940, "step": 4827, "time_per_iteration": 2.6447484493255615 }, { "auxiliary_loss_clip": 0.01125924, "auxiliary_loss_mlp": 0.0104453, "balance_loss_clip": 1.04981911, "balance_loss_mlp": 1.02650571, "epoch": 0.290275063880956, "flos": 20006481104640.0, "grad_norm": 1.8485927197530279, "language_loss": 0.80266023, "learning_rate": 3.3304670096810545e-06, "loss": 0.82436466, "num_input_tokens_seen": 104154280, "step": 4828, "time_per_iteration": 4.131803274154663 }, { "auxiliary_loss_clip": 0.01144317, "auxiliary_loss_mlp": 0.01049939, "balance_loss_clip": 1.05393863, "balance_loss_mlp": 1.03288054, "epoch": 0.29033518713362394, "flos": 22053605022720.0, "grad_norm": 1.8003854621941846, "language_loss": 0.80658895, "learning_rate": 3.33017619858836e-06, "loss": 0.8285315, "num_input_tokens_seen": 104172605, "step": 4829, "time_per_iteration": 2.760899066925049 }, { "auxiliary_loss_clip": 0.011197, "auxiliary_loss_mlp": 0.01044046, "balance_loss_clip": 1.05093288, "balance_loss_mlp": 1.02680826, "epoch": 0.2903953103862919, "flos": 25630056351360.0, "grad_norm": 1.5734536519128175, "language_loss": 0.82911146, "learning_rate": 3.329885337055249e-06, "loss": 0.85074902, "num_input_tokens_seen": 104194120, "step": 4830, "time_per_iteration": 4.403480529785156 }, { "auxiliary_loss_clip": 0.01137563, "auxiliary_loss_mlp": 0.01048934, "balance_loss_clip": 1.05430257, "balance_loss_mlp": 1.03155351, "epoch": 0.29045543363895987, "flos": 16945851035520.0, "grad_norm": 2.2586543311689486, "language_loss": 0.79236752, "learning_rate": 3.3295944250927546e-06, "loss": 0.81423253, "num_input_tokens_seen": 104210875, "step": 4831, "time_per_iteration": 2.6066412925720215 }, { "auxiliary_loss_clip": 0.01143728, "auxiliary_loss_mlp": 0.01045824, "balance_loss_clip": 1.05470276, "balance_loss_mlp": 1.03000546, "epoch": 0.29051555689162784, "flos": 26395492199040.0, "grad_norm": 1.9694662738232038, "language_loss": 0.7459774, "learning_rate": 3.3293034627119055e-06, "loss": 0.76787293, "num_input_tokens_seen": 104229875, "step": 4832, "time_per_iteration": 2.8411331176757812 }, { "auxiliary_loss_clip": 0.01122405, "auxiliary_loss_mlp": 0.01037758, "balance_loss_clip": 1.05429769, "balance_loss_mlp": 1.02335787, "epoch": 0.2905756801442958, "flos": 21103875469440.0, "grad_norm": 1.979215737756815, "language_loss": 0.76150024, "learning_rate": 3.329012449923736e-06, "loss": 0.78310186, "num_input_tokens_seen": 104250405, "step": 4833, "time_per_iteration": 2.7510006427764893 }, { "auxiliary_loss_clip": 0.01107016, "auxiliary_loss_mlp": 0.01040024, "balance_loss_clip": 1.04580688, "balance_loss_mlp": 1.02383542, "epoch": 0.29063580339696377, "flos": 15706071158400.0, "grad_norm": 1.7715964188803632, "language_loss": 0.64404124, "learning_rate": 3.3287213867392813e-06, "loss": 0.66551173, "num_input_tokens_seen": 104269185, "step": 4834, "time_per_iteration": 2.6475064754486084 }, { "auxiliary_loss_clip": 0.01117159, "auxiliary_loss_mlp": 0.01032155, "balance_loss_clip": 1.05111325, "balance_loss_mlp": 1.01724815, "epoch": 0.29069592664963173, "flos": 24644990793600.0, "grad_norm": 1.4640588842294755, "language_loss": 0.71717769, "learning_rate": 3.3284302731695783e-06, "loss": 0.73867083, "num_input_tokens_seen": 104289400, "step": 4835, "time_per_iteration": 2.6991324424743652 }, { "auxiliary_loss_clip": 0.01117393, "auxiliary_loss_mlp": 0.01037314, "balance_loss_clip": 1.04881835, "balance_loss_mlp": 1.02187634, "epoch": 0.2907560499022997, "flos": 24973753000320.0, "grad_norm": 1.657223137158586, "language_loss": 0.79492378, "learning_rate": 3.3281391092256668e-06, "loss": 0.81647086, "num_input_tokens_seen": 104310485, "step": 4836, "time_per_iteration": 2.7060084342956543 }, { "auxiliary_loss_clip": 0.01107347, "auxiliary_loss_mlp": 0.01045193, "balance_loss_clip": 1.05334711, "balance_loss_mlp": 1.02744293, "epoch": 0.29081617315496766, "flos": 18657496903680.0, "grad_norm": 1.9442300400082562, "language_loss": 0.81372344, "learning_rate": 3.3278478949185865e-06, "loss": 0.83524883, "num_input_tokens_seen": 104327330, "step": 4837, "time_per_iteration": 2.640610933303833 }, { "auxiliary_loss_clip": 0.01116355, "auxiliary_loss_mlp": 0.01039398, "balance_loss_clip": 1.04938102, "balance_loss_mlp": 1.0233283, "epoch": 0.2908762964076356, "flos": 35331035955840.0, "grad_norm": 6.209911556378307, "language_loss": 0.67358792, "learning_rate": 3.327556630259381e-06, "loss": 0.69514549, "num_input_tokens_seen": 104350350, "step": 4838, "time_per_iteration": 2.758422374725342 }, { "auxiliary_loss_clip": 0.01147958, "auxiliary_loss_mlp": 0.00775113, "balance_loss_clip": 1.05402315, "balance_loss_mlp": 1.00096607, "epoch": 0.29093641966030365, "flos": 23076305055360.0, "grad_norm": 1.5628414298261506, "language_loss": 0.71139944, "learning_rate": 3.327265315259095e-06, "loss": 0.73063016, "num_input_tokens_seen": 104369995, "step": 4839, "time_per_iteration": 2.683349132537842 }, { "auxiliary_loss_clip": 0.0114095, "auxiliary_loss_mlp": 0.01036937, "balance_loss_clip": 1.04966319, "balance_loss_mlp": 1.02147555, "epoch": 0.2909965429129716, "flos": 35955415094400.0, "grad_norm": 1.9403130873020338, "language_loss": 0.7539593, "learning_rate": 3.326973949928776e-06, "loss": 0.77573812, "num_input_tokens_seen": 104392285, "step": 4840, "time_per_iteration": 2.696808099746704 }, { "auxiliary_loss_clip": 0.01093571, "auxiliary_loss_mlp": 0.01045095, "balance_loss_clip": 1.04470551, "balance_loss_mlp": 1.02825069, "epoch": 0.2910566661656396, "flos": 30880231764480.0, "grad_norm": 1.7841334294021773, "language_loss": 0.60546595, "learning_rate": 3.326682534279471e-06, "loss": 0.62685257, "num_input_tokens_seen": 104412640, "step": 4841, "time_per_iteration": 2.74575138092041 }, { "auxiliary_loss_clip": 0.01120271, "auxiliary_loss_mlp": 0.01039624, "balance_loss_clip": 1.04983509, "balance_loss_mlp": 1.02288651, "epoch": 0.29111678941830754, "flos": 30010188533760.0, "grad_norm": 1.408353605568525, "language_loss": 0.71321762, "learning_rate": 3.326391068322232e-06, "loss": 0.73481655, "num_input_tokens_seen": 104435245, "step": 4842, "time_per_iteration": 2.7568962574005127 }, { "auxiliary_loss_clip": 0.01130885, "auxiliary_loss_mlp": 0.01037088, "balance_loss_clip": 1.05042899, "balance_loss_mlp": 1.02191257, "epoch": 0.2911769126709755, "flos": 22857393617280.0, "grad_norm": 2.1183002067983585, "language_loss": 0.73610562, "learning_rate": 3.3260995520681098e-06, "loss": 0.75778532, "num_input_tokens_seen": 104455395, "step": 4843, "time_per_iteration": 2.6703171730041504 }, { "auxiliary_loss_clip": 0.0108851, "auxiliary_loss_mlp": 0.01036244, "balance_loss_clip": 1.04775739, "balance_loss_mlp": 1.02058005, "epoch": 0.2912370359236435, "flos": 21650507619840.0, "grad_norm": 4.868884277111801, "language_loss": 0.58445942, "learning_rate": 3.3258079855281602e-06, "loss": 0.60570699, "num_input_tokens_seen": 104473350, "step": 4844, "time_per_iteration": 2.7461965084075928 }, { "auxiliary_loss_clip": 0.01138917, "auxiliary_loss_mlp": 0.01039428, "balance_loss_clip": 1.05586743, "balance_loss_mlp": 1.0222863, "epoch": 0.29129715917631144, "flos": 22893340152960.0, "grad_norm": 1.9200815982611392, "language_loss": 0.86459565, "learning_rate": 3.3255163687134396e-06, "loss": 0.88637912, "num_input_tokens_seen": 104492265, "step": 4845, "time_per_iteration": 2.711101770401001 }, { "auxiliary_loss_clip": 0.01115849, "auxiliary_loss_mlp": 0.01052584, "balance_loss_clip": 1.05018926, "balance_loss_mlp": 1.03505993, "epoch": 0.2913572824289794, "flos": 22674464628480.0, "grad_norm": 1.7226223126663984, "language_loss": 0.67067879, "learning_rate": 3.3252247016350046e-06, "loss": 0.69236308, "num_input_tokens_seen": 104510755, "step": 4846, "time_per_iteration": 2.698076009750366 }, { "auxiliary_loss_clip": 0.01120746, "auxiliary_loss_mlp": 0.01040428, "balance_loss_clip": 1.05198884, "balance_loss_mlp": 1.02457917, "epoch": 0.29141740568164737, "flos": 23107403255040.0, "grad_norm": 1.9884880347168128, "language_loss": 0.70629871, "learning_rate": 3.3249329843039166e-06, "loss": 0.7279104, "num_input_tokens_seen": 104530830, "step": 4847, "time_per_iteration": 2.6693859100341797 }, { "auxiliary_loss_clip": 0.01129385, "auxiliary_loss_mlp": 0.01036362, "balance_loss_clip": 1.0490911, "balance_loss_mlp": 1.02048314, "epoch": 0.29147752893431533, "flos": 23587026583680.0, "grad_norm": 1.4444788582363046, "language_loss": 0.73975939, "learning_rate": 3.324641216731237e-06, "loss": 0.76141691, "num_input_tokens_seen": 104550115, "step": 4848, "time_per_iteration": 2.779012680053711 }, { "auxiliary_loss_clip": 0.0112526, "auxiliary_loss_mlp": 0.01051811, "balance_loss_clip": 1.04831481, "balance_loss_mlp": 1.03391802, "epoch": 0.2915376521869833, "flos": 20591968792320.0, "grad_norm": 3.067540232947916, "language_loss": 0.76738584, "learning_rate": 3.3243493989280295e-06, "loss": 0.7891565, "num_input_tokens_seen": 104566255, "step": 4849, "time_per_iteration": 2.6103999614715576 }, { "auxiliary_loss_clip": 0.01124372, "auxiliary_loss_mlp": 0.01041862, "balance_loss_clip": 1.04718697, "balance_loss_mlp": 1.02541125, "epoch": 0.29159777543965126, "flos": 20811490761600.0, "grad_norm": 1.7266499063872853, "language_loss": 0.78276592, "learning_rate": 3.3240575309053596e-06, "loss": 0.80442822, "num_input_tokens_seen": 104585235, "step": 4850, "time_per_iteration": 2.6395609378814697 }, { "auxiliary_loss_clip": 0.01111964, "auxiliary_loss_mlp": 0.01038044, "balance_loss_clip": 1.04907775, "balance_loss_mlp": 1.0209378, "epoch": 0.29165789869231923, "flos": 24244155947520.0, "grad_norm": 1.8024770323318549, "language_loss": 0.7657702, "learning_rate": 3.323765612674296e-06, "loss": 0.78727031, "num_input_tokens_seen": 104605315, "step": 4851, "time_per_iteration": 2.7265985012054443 }, { "auxiliary_loss_clip": 0.01132156, "auxiliary_loss_mlp": 0.01045641, "balance_loss_clip": 1.052459, "balance_loss_mlp": 1.03083527, "epoch": 0.29171802194498725, "flos": 28949925853440.0, "grad_norm": 1.3639310788782566, "language_loss": 0.77680421, "learning_rate": 3.3234736442459078e-06, "loss": 0.7985822, "num_input_tokens_seen": 104626055, "step": 4852, "time_per_iteration": 2.7161712646484375 }, { "auxiliary_loss_clip": 0.01120344, "auxiliary_loss_mlp": 0.01051407, "balance_loss_clip": 1.05108476, "balance_loss_mlp": 1.03523064, "epoch": 0.2917781451976552, "flos": 22598226011520.0, "grad_norm": 1.6397145219173752, "language_loss": 0.7816534, "learning_rate": 3.3231816256312665e-06, "loss": 0.80337089, "num_input_tokens_seen": 104646005, "step": 4853, "time_per_iteration": 2.748053789138794 }, { "auxiliary_loss_clip": 0.01108012, "auxiliary_loss_mlp": 0.01041349, "balance_loss_clip": 1.04923177, "balance_loss_mlp": 1.02535105, "epoch": 0.2918382684503232, "flos": 21574448570880.0, "grad_norm": 2.273586870261815, "language_loss": 0.8791436, "learning_rate": 3.322889556841445e-06, "loss": 0.90063715, "num_input_tokens_seen": 104661620, "step": 4854, "time_per_iteration": 2.7663791179656982 }, { "auxiliary_loss_clip": 0.01128591, "auxiliary_loss_mlp": 0.01054226, "balance_loss_clip": 1.05255818, "balance_loss_mlp": 1.03502131, "epoch": 0.29189839170299114, "flos": 24353503925760.0, "grad_norm": 1.7143523369489482, "language_loss": 0.86374146, "learning_rate": 3.322597437887519e-06, "loss": 0.88556957, "num_input_tokens_seen": 104681445, "step": 4855, "time_per_iteration": 2.613903284072876 }, { "auxiliary_loss_clip": 0.01039808, "auxiliary_loss_mlp": 0.01005184, "balance_loss_clip": 1.02170599, "balance_loss_mlp": 1.00303864, "epoch": 0.2919585149556591, "flos": 71316726215040.0, "grad_norm": 0.7954079009769616, "language_loss": 0.60148996, "learning_rate": 3.322305268780566e-06, "loss": 0.6219399, "num_input_tokens_seen": 104747945, "step": 4856, "time_per_iteration": 3.273501396179199 }, { "auxiliary_loss_clip": 0.01115701, "auxiliary_loss_mlp": 0.00774991, "balance_loss_clip": 1.04708552, "balance_loss_mlp": 1.00107539, "epoch": 0.2920186382083271, "flos": 15633208419840.0, "grad_norm": 1.7540806356878256, "language_loss": 0.6825304, "learning_rate": 3.322013049531664e-06, "loss": 0.70143735, "num_input_tokens_seen": 104766225, "step": 4857, "time_per_iteration": 2.6799964904785156 }, { "auxiliary_loss_clip": 0.01129839, "auxiliary_loss_mlp": 0.00774071, "balance_loss_clip": 1.05058599, "balance_loss_mlp": 1.00106227, "epoch": 0.29207876146099504, "flos": 28366018364160.0, "grad_norm": 1.9069678720023968, "language_loss": 0.83446503, "learning_rate": 3.321720780151895e-06, "loss": 0.85350412, "num_input_tokens_seen": 104785345, "step": 4858, "time_per_iteration": 2.7004997730255127 }, { "auxiliary_loss_clip": 0.01143419, "auxiliary_loss_mlp": 0.01047414, "balance_loss_clip": 1.05265319, "balance_loss_mlp": 1.03119004, "epoch": 0.292138884713663, "flos": 21870963342720.0, "grad_norm": 1.7162042036272904, "language_loss": 0.77357888, "learning_rate": 3.321428460652342e-06, "loss": 0.79548717, "num_input_tokens_seen": 104804560, "step": 4859, "time_per_iteration": 2.5901620388031006 }, { "auxiliary_loss_clip": 0.01105726, "auxiliary_loss_mlp": 0.01044957, "balance_loss_clip": 1.05237806, "balance_loss_mlp": 1.02816057, "epoch": 0.29219900796633097, "flos": 20992552243200.0, "grad_norm": 2.2554676354860246, "language_loss": 0.68046212, "learning_rate": 3.3211360910440885e-06, "loss": 0.70196903, "num_input_tokens_seen": 104821105, "step": 4860, "time_per_iteration": 2.7831058502197266 }, { "auxiliary_loss_clip": 0.01117304, "auxiliary_loss_mlp": 0.01041096, "balance_loss_clip": 1.05229402, "balance_loss_mlp": 1.02662396, "epoch": 0.29225913121899894, "flos": 35004608133120.0, "grad_norm": 2.539974445673703, "language_loss": 0.75258791, "learning_rate": 3.320843671338222e-06, "loss": 0.77417195, "num_input_tokens_seen": 104841440, "step": 4861, "time_per_iteration": 2.7506070137023926 }, { "auxiliary_loss_clip": 0.01128031, "auxiliary_loss_mlp": 0.0105121, "balance_loss_clip": 1.04845262, "balance_loss_mlp": 1.03620112, "epoch": 0.2923192544716669, "flos": 13515663888000.0, "grad_norm": 3.0942357088370245, "language_loss": 0.91498685, "learning_rate": 3.320551201545832e-06, "loss": 0.93677926, "num_input_tokens_seen": 104858210, "step": 4862, "time_per_iteration": 2.589700937271118 }, { "auxiliary_loss_clip": 0.01131947, "auxiliary_loss_mlp": 0.01042917, "balance_loss_clip": 1.05090141, "balance_loss_mlp": 1.02786124, "epoch": 0.29237937772433487, "flos": 19463512141440.0, "grad_norm": 2.2124063953391464, "language_loss": 0.73112279, "learning_rate": 3.320258681678008e-06, "loss": 0.75287139, "num_input_tokens_seen": 104875620, "step": 4863, "time_per_iteration": 4.142335653305054 }, { "auxiliary_loss_clip": 0.01061699, "auxiliary_loss_mlp": 0.01044676, "balance_loss_clip": 1.04478168, "balance_loss_mlp": 1.02934611, "epoch": 0.29243950097700283, "flos": 20850597694080.0, "grad_norm": 1.893468710780351, "language_loss": 0.77841508, "learning_rate": 3.319966111745842e-06, "loss": 0.79947883, "num_input_tokens_seen": 104894600, "step": 4864, "time_per_iteration": 4.309613943099976 }, { "auxiliary_loss_clip": 0.01102707, "auxiliary_loss_mlp": 0.01050983, "balance_loss_clip": 1.04593945, "balance_loss_mlp": 1.03424644, "epoch": 0.29249962422967085, "flos": 23584225322880.0, "grad_norm": 1.5703024458168264, "language_loss": 0.81861019, "learning_rate": 3.319673491760429e-06, "loss": 0.84014714, "num_input_tokens_seen": 104914530, "step": 4865, "time_per_iteration": 2.762397527694702 }, { "auxiliary_loss_clip": 0.0109576, "auxiliary_loss_mlp": 0.01046651, "balance_loss_clip": 1.05265307, "balance_loss_mlp": 1.02924657, "epoch": 0.2925597474823388, "flos": 22273342473600.0, "grad_norm": 2.2072447614425554, "language_loss": 0.85522473, "learning_rate": 3.3193808217328645e-06, "loss": 0.87664878, "num_input_tokens_seen": 104933460, "step": 4866, "time_per_iteration": 2.8033764362335205 }, { "auxiliary_loss_clip": 0.01110933, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.04811919, "balance_loss_mlp": 1.02410054, "epoch": 0.2926198707350068, "flos": 34456108475520.0, "grad_norm": 1.7213351696608077, "language_loss": 0.75498515, "learning_rate": 3.3190881016742476e-06, "loss": 0.7764926, "num_input_tokens_seen": 104954495, "step": 4867, "time_per_iteration": 4.2950732707977295 }, { "auxiliary_loss_clip": 0.01083116, "auxiliary_loss_mlp": 0.01052463, "balance_loss_clip": 1.04825687, "balance_loss_mlp": 1.03576183, "epoch": 0.29267999398767475, "flos": 20704153944960.0, "grad_norm": 1.9203033465249189, "language_loss": 0.73236179, "learning_rate": 3.3187953315956776e-06, "loss": 0.75371754, "num_input_tokens_seen": 104971915, "step": 4868, "time_per_iteration": 2.775538921356201 }, { "auxiliary_loss_clip": 0.01091396, "auxiliary_loss_mlp": 0.01045538, "balance_loss_clip": 1.04888034, "balance_loss_mlp": 1.02836001, "epoch": 0.2927401172403427, "flos": 18368667642240.0, "grad_norm": 1.663889887662616, "language_loss": 0.74540651, "learning_rate": 3.3185025115082566e-06, "loss": 0.76677585, "num_input_tokens_seen": 104991335, "step": 4869, "time_per_iteration": 2.734683036804199 }, { "auxiliary_loss_clip": 0.01116568, "auxiliary_loss_mlp": 0.01040323, "balance_loss_clip": 1.050179, "balance_loss_mlp": 1.02405143, "epoch": 0.2928002404930107, "flos": 26104041244800.0, "grad_norm": 1.5721867242720646, "language_loss": 0.76492888, "learning_rate": 3.318209641423088e-06, "loss": 0.78649783, "num_input_tokens_seen": 105012015, "step": 4870, "time_per_iteration": 4.413575649261475 }, { "auxiliary_loss_clip": 0.01133789, "auxiliary_loss_mlp": 0.0105055, "balance_loss_clip": 1.05237079, "balance_loss_mlp": 1.0328114, "epoch": 0.29286036374567864, "flos": 21324726241920.0, "grad_norm": 2.0174334678237655, "language_loss": 0.6773119, "learning_rate": 3.3179167213512777e-06, "loss": 0.69915527, "num_input_tokens_seen": 105031460, "step": 4871, "time_per_iteration": 2.68796706199646 }, { "auxiliary_loss_clip": 0.01112736, "auxiliary_loss_mlp": 0.01051475, "balance_loss_clip": 1.04638386, "balance_loss_mlp": 1.03515494, "epoch": 0.2929204869983466, "flos": 29569492569600.0, "grad_norm": 4.945083241782643, "language_loss": 0.77463269, "learning_rate": 3.317623751303933e-06, "loss": 0.79627478, "num_input_tokens_seen": 105052965, "step": 4872, "time_per_iteration": 2.7679827213287354 }, { "auxiliary_loss_clip": 0.01078644, "auxiliary_loss_mlp": 0.01045822, "balance_loss_clip": 1.0468123, "balance_loss_mlp": 1.0273211, "epoch": 0.2929806102510146, "flos": 19058259922560.0, "grad_norm": 1.9468785945114855, "language_loss": 0.72814691, "learning_rate": 3.317330731292164e-06, "loss": 0.74939156, "num_input_tokens_seen": 105071840, "step": 4873, "time_per_iteration": 2.8704919815063477 }, { "auxiliary_loss_clip": 0.01135073, "auxiliary_loss_mlp": 0.01044722, "balance_loss_clip": 1.0525651, "balance_loss_mlp": 1.02705503, "epoch": 0.29304073350368254, "flos": 21944221130880.0, "grad_norm": 1.9420707280566882, "language_loss": 0.78093398, "learning_rate": 3.3170376613270812e-06, "loss": 0.80273187, "num_input_tokens_seen": 105089445, "step": 4874, "time_per_iteration": 2.6573073863983154 }, { "auxiliary_loss_clip": 0.01093774, "auxiliary_loss_mlp": 0.01045077, "balance_loss_clip": 1.05151463, "balance_loss_mlp": 1.02790475, "epoch": 0.2931008567563505, "flos": 15450818135040.0, "grad_norm": 1.8901262824755785, "language_loss": 0.77336359, "learning_rate": 3.3167445414197985e-06, "loss": 0.794752, "num_input_tokens_seen": 105106210, "step": 4875, "time_per_iteration": 2.6960959434509277 }, { "auxiliary_loss_clip": 0.01141436, "auxiliary_loss_mlp": 0.01038673, "balance_loss_clip": 1.05718327, "balance_loss_mlp": 1.02218604, "epoch": 0.29316098000901847, "flos": 16983162288000.0, "grad_norm": 1.556341262673854, "language_loss": 0.69037539, "learning_rate": 3.316451371581431e-06, "loss": 0.71217644, "num_input_tokens_seen": 105124200, "step": 4876, "time_per_iteration": 2.6719844341278076 }, { "auxiliary_loss_clip": 0.01121768, "auxiliary_loss_mlp": 0.01047732, "balance_loss_clip": 1.04729414, "balance_loss_mlp": 1.03105509, "epoch": 0.29322110326168643, "flos": 16357705741440.0, "grad_norm": 2.0371531421747466, "language_loss": 0.82111382, "learning_rate": 3.316158151823096e-06, "loss": 0.84280884, "num_input_tokens_seen": 105140400, "step": 4877, "time_per_iteration": 2.632293462753296 }, { "auxiliary_loss_clip": 0.01139233, "auxiliary_loss_mlp": 0.01040634, "balance_loss_clip": 1.05428672, "balance_loss_mlp": 1.02392054, "epoch": 0.29328122651435445, "flos": 13990869843840.0, "grad_norm": 3.614839551588232, "language_loss": 0.67366385, "learning_rate": 3.315864882155911e-06, "loss": 0.69546252, "num_input_tokens_seen": 105157535, "step": 4878, "time_per_iteration": 2.5839362144470215 }, { "auxiliary_loss_clip": 0.01100237, "auxiliary_loss_mlp": 0.01045253, "balance_loss_clip": 1.04628241, "balance_loss_mlp": 1.02817595, "epoch": 0.2933413497670224, "flos": 25264593423360.0, "grad_norm": 2.0985622071445063, "language_loss": 0.73632258, "learning_rate": 3.3155715625909982e-06, "loss": 0.75777751, "num_input_tokens_seen": 105175185, "step": 4879, "time_per_iteration": 2.738429307937622 }, { "auxiliary_loss_clip": 0.01104776, "auxiliary_loss_mlp": 0.00776504, "balance_loss_clip": 1.05266857, "balance_loss_mlp": 1.00116253, "epoch": 0.2934014730196904, "flos": 32123746656000.0, "grad_norm": 1.8172867500477656, "language_loss": 0.66441375, "learning_rate": 3.3152781931394803e-06, "loss": 0.68322659, "num_input_tokens_seen": 105194540, "step": 4880, "time_per_iteration": 2.7889339923858643 }, { "auxiliary_loss_clip": 0.01130875, "auxiliary_loss_mlp": 0.01049004, "balance_loss_clip": 1.05021453, "balance_loss_mlp": 1.03249359, "epoch": 0.29346159627235835, "flos": 24352498344960.0, "grad_norm": 1.9971358437235982, "language_loss": 0.70130688, "learning_rate": 3.314984773812481e-06, "loss": 0.72310567, "num_input_tokens_seen": 105213215, "step": 4881, "time_per_iteration": 2.705906629562378 }, { "auxiliary_loss_clip": 0.01112418, "auxiliary_loss_mlp": 0.00775734, "balance_loss_clip": 1.04823685, "balance_loss_mlp": 1.00119698, "epoch": 0.2935217195250263, "flos": 22746752749440.0, "grad_norm": 1.8949601379230998, "language_loss": 0.83497417, "learning_rate": 3.314691304621127e-06, "loss": 0.85385573, "num_input_tokens_seen": 105231585, "step": 4882, "time_per_iteration": 2.715853691101074 }, { "auxiliary_loss_clip": 0.01148283, "auxiliary_loss_mlp": 0.01045596, "balance_loss_clip": 1.05350292, "balance_loss_mlp": 1.02825117, "epoch": 0.2935818427776943, "flos": 21725561088000.0, "grad_norm": 2.6750396503443827, "language_loss": 0.71433568, "learning_rate": 3.314397785576548e-06, "loss": 0.73627448, "num_input_tokens_seen": 105250120, "step": 4883, "time_per_iteration": 2.629642963409424 }, { "auxiliary_loss_clip": 0.01123143, "auxiliary_loss_mlp": 0.01040743, "balance_loss_clip": 1.05262315, "balance_loss_mlp": 1.0230521, "epoch": 0.29364196603036224, "flos": 23804968354560.0, "grad_norm": 2.1262053984109226, "language_loss": 0.92650437, "learning_rate": 3.3141042166898726e-06, "loss": 0.94814324, "num_input_tokens_seen": 105266065, "step": 4884, "time_per_iteration": 2.727379322052002 }, { "auxiliary_loss_clip": 0.01138638, "auxiliary_loss_mlp": 0.01039707, "balance_loss_clip": 1.05512667, "balance_loss_mlp": 1.0232085, "epoch": 0.2937020892830302, "flos": 23470064922240.0, "grad_norm": 2.19754538449792, "language_loss": 0.73535883, "learning_rate": 3.313810597972234e-06, "loss": 0.75714231, "num_input_tokens_seen": 105282155, "step": 4885, "time_per_iteration": 2.706212043762207 }, { "auxiliary_loss_clip": 0.01124089, "auxiliary_loss_mlp": 0.01045234, "balance_loss_clip": 1.04882109, "balance_loss_mlp": 1.02791286, "epoch": 0.2937622125356982, "flos": 24272740195200.0, "grad_norm": 2.8259058407064566, "language_loss": 0.84815478, "learning_rate": 3.3135169294347655e-06, "loss": 0.86984795, "num_input_tokens_seen": 105299225, "step": 4886, "time_per_iteration": 2.651383876800537 }, { "auxiliary_loss_clip": 0.01112051, "auxiliary_loss_mlp": 0.01040147, "balance_loss_clip": 1.04674077, "balance_loss_mlp": 1.023839, "epoch": 0.29382233578836614, "flos": 20662461233280.0, "grad_norm": 2.312079302728887, "language_loss": 0.77030611, "learning_rate": 3.313223211088603e-06, "loss": 0.7918281, "num_input_tokens_seen": 105315710, "step": 4887, "time_per_iteration": 2.8299317359924316 }, { "auxiliary_loss_clip": 0.01121167, "auxiliary_loss_mlp": 0.01044419, "balance_loss_clip": 1.05137563, "balance_loss_mlp": 1.02809978, "epoch": 0.2938824590410341, "flos": 16545052103040.0, "grad_norm": 4.814706857660641, "language_loss": 0.79822707, "learning_rate": 3.3129294429448855e-06, "loss": 0.81988299, "num_input_tokens_seen": 105333505, "step": 4888, "time_per_iteration": 2.6942543983459473 }, { "auxiliary_loss_clip": 0.01114672, "auxiliary_loss_mlp": 0.01035208, "balance_loss_clip": 1.05101824, "balance_loss_mlp": 1.01886487, "epoch": 0.29394258229370207, "flos": 37925474382720.0, "grad_norm": 1.8060574020422921, "language_loss": 0.55514884, "learning_rate": 3.3126356250147517e-06, "loss": 0.57664764, "num_input_tokens_seen": 105355605, "step": 4889, "time_per_iteration": 2.838529586791992 }, { "auxiliary_loss_clip": 0.01136079, "auxiliary_loss_mlp": 0.01040242, "balance_loss_clip": 1.05230045, "balance_loss_mlp": 1.02257514, "epoch": 0.29400270554637004, "flos": 20044690197120.0, "grad_norm": 1.9006309093473746, "language_loss": 0.84414017, "learning_rate": 3.3123417573093434e-06, "loss": 0.86590338, "num_input_tokens_seen": 105374225, "step": 4890, "time_per_iteration": 2.653601884841919 }, { "auxiliary_loss_clip": 0.01138833, "auxiliary_loss_mlp": 0.01044226, "balance_loss_clip": 1.05449104, "balance_loss_mlp": 1.02767992, "epoch": 0.294062828799038, "flos": 15266380775040.0, "grad_norm": 2.3284792525221625, "language_loss": 0.72417939, "learning_rate": 3.3120478398398046e-06, "loss": 0.74600995, "num_input_tokens_seen": 105391565, "step": 4891, "time_per_iteration": 2.6499764919281006 }, { "auxiliary_loss_clip": 0.01148906, "auxiliary_loss_mlp": 0.01046245, "balance_loss_clip": 1.05517375, "balance_loss_mlp": 1.02797008, "epoch": 0.294122952051706, "flos": 22747147799040.0, "grad_norm": 1.6858898954482169, "language_loss": 0.77310836, "learning_rate": 3.3117538726172797e-06, "loss": 0.7950598, "num_input_tokens_seen": 105409840, "step": 4892, "time_per_iteration": 2.6123669147491455 }, { "auxiliary_loss_clip": 0.01143283, "auxiliary_loss_mlp": 0.01036481, "balance_loss_clip": 1.05147183, "balance_loss_mlp": 1.01932704, "epoch": 0.294183075304374, "flos": 24972891073920.0, "grad_norm": 1.8056938004749827, "language_loss": 0.77826709, "learning_rate": 3.3114598556529164e-06, "loss": 0.80006474, "num_input_tokens_seen": 105428645, "step": 4893, "time_per_iteration": 2.6142194271087646 }, { "auxiliary_loss_clip": 0.01106286, "auxiliary_loss_mlp": 0.01045871, "balance_loss_clip": 1.0508399, "balance_loss_mlp": 1.02912164, "epoch": 0.29424319855704195, "flos": 30952986762240.0, "grad_norm": 3.6552959609210944, "language_loss": 0.85032988, "learning_rate": 3.311165788957864e-06, "loss": 0.87185144, "num_input_tokens_seen": 105447480, "step": 4894, "time_per_iteration": 2.837883234024048 }, { "auxiliary_loss_clip": 0.01131513, "auxiliary_loss_mlp": 0.01038131, "balance_loss_clip": 1.05098557, "balance_loss_mlp": 1.02169216, "epoch": 0.2943033218097099, "flos": 15231583474560.0, "grad_norm": 3.570255241204836, "language_loss": 0.90650308, "learning_rate": 3.310871672543274e-06, "loss": 0.92819947, "num_input_tokens_seen": 105464600, "step": 4895, "time_per_iteration": 2.588153839111328 }, { "auxiliary_loss_clip": 0.01138224, "auxiliary_loss_mlp": 0.01045554, "balance_loss_clip": 1.05338621, "balance_loss_mlp": 1.02777958, "epoch": 0.2943634450623779, "flos": 21725884310400.0, "grad_norm": 1.7548452829513195, "language_loss": 0.86612183, "learning_rate": 3.3105775064202982e-06, "loss": 0.88795966, "num_input_tokens_seen": 105481510, "step": 4896, "time_per_iteration": 2.6405279636383057 }, { "auxiliary_loss_clip": 0.01142594, "auxiliary_loss_mlp": 0.01053714, "balance_loss_clip": 1.05662429, "balance_loss_mlp": 1.03620195, "epoch": 0.29442356831504585, "flos": 22602104680320.0, "grad_norm": 2.0549220420715906, "language_loss": 0.73394442, "learning_rate": 3.3102832906000924e-06, "loss": 0.75590742, "num_input_tokens_seen": 105501390, "step": 4897, "time_per_iteration": 2.6669554710388184 }, { "auxiliary_loss_clip": 0.01128563, "auxiliary_loss_mlp": 0.01050668, "balance_loss_clip": 1.04556203, "balance_loss_mlp": 1.03214252, "epoch": 0.2944836915677138, "flos": 20011401267840.0, "grad_norm": 2.0814872266581426, "language_loss": 0.74344778, "learning_rate": 3.309989025093813e-06, "loss": 0.76524007, "num_input_tokens_seen": 105519600, "step": 4898, "time_per_iteration": 2.6286890506744385 }, { "auxiliary_loss_clip": 0.01140269, "auxiliary_loss_mlp": 0.01047883, "balance_loss_clip": 1.05775058, "balance_loss_mlp": 1.02880955, "epoch": 0.2945438148203818, "flos": 20045875345920.0, "grad_norm": 2.610474436320842, "language_loss": 0.70560962, "learning_rate": 3.309694709912618e-06, "loss": 0.72749114, "num_input_tokens_seen": 105535970, "step": 4899, "time_per_iteration": 2.6050777435302734 }, { "auxiliary_loss_clip": 0.01122842, "auxiliary_loss_mlp": 0.00775757, "balance_loss_clip": 1.05115175, "balance_loss_mlp": 1.00110114, "epoch": 0.29460393807304974, "flos": 23733542160000.0, "grad_norm": 2.6981557529788587, "language_loss": 0.78938496, "learning_rate": 3.3094003450676685e-06, "loss": 0.80837095, "num_input_tokens_seen": 105556735, "step": 4900, "time_per_iteration": 2.7517058849334717 }, { "auxiliary_loss_clip": 0.0110429, "auxiliary_loss_mlp": 0.01059395, "balance_loss_clip": 1.04257679, "balance_loss_mlp": 1.03992808, "epoch": 0.2946640613257177, "flos": 14976079056000.0, "grad_norm": 1.7286923709762618, "language_loss": 0.80861294, "learning_rate": 3.3091059305701268e-06, "loss": 0.83024979, "num_input_tokens_seen": 105574875, "step": 4901, "time_per_iteration": 2.58297061920166 }, { "auxiliary_loss_clip": 0.01114064, "auxiliary_loss_mlp": 0.01035256, "balance_loss_clip": 1.05081403, "balance_loss_mlp": 1.01993775, "epoch": 0.2947241845783857, "flos": 24243904552320.0, "grad_norm": 2.2236242529025954, "language_loss": 0.57768303, "learning_rate": 3.308811466431157e-06, "loss": 0.59917623, "num_input_tokens_seen": 105594225, "step": 4902, "time_per_iteration": 2.6765553951263428 }, { "auxiliary_loss_clip": 0.01122886, "auxiliary_loss_mlp": 0.01044406, "balance_loss_clip": 1.05165744, "balance_loss_mlp": 1.02809834, "epoch": 0.29478430783105364, "flos": 19938394874880.0, "grad_norm": 1.6365628527843905, "language_loss": 0.7553789, "learning_rate": 3.308516952661925e-06, "loss": 0.77705181, "num_input_tokens_seen": 105614000, "step": 4903, "time_per_iteration": 5.72201132774353 }, { "auxiliary_loss_clip": 0.01117125, "auxiliary_loss_mlp": 0.01054328, "balance_loss_clip": 1.05058551, "balance_loss_mlp": 1.03506362, "epoch": 0.2948444310837216, "flos": 27381347856000.0, "grad_norm": 1.79479894391178, "language_loss": 0.62782186, "learning_rate": 3.3082223892736e-06, "loss": 0.64953631, "num_input_tokens_seen": 105634575, "step": 4904, "time_per_iteration": 2.7290875911712646 }, { "auxiliary_loss_clip": 0.01135143, "auxiliary_loss_mlp": 0.01043669, "balance_loss_clip": 1.05146813, "balance_loss_mlp": 1.02669382, "epoch": 0.2949045543363896, "flos": 23405462311680.0, "grad_norm": 1.4755442774564356, "language_loss": 0.73145443, "learning_rate": 3.3079277762773496e-06, "loss": 0.75324261, "num_input_tokens_seen": 105654385, "step": 4905, "time_per_iteration": 2.6482555866241455 }, { "auxiliary_loss_clip": 0.01112476, "auxiliary_loss_mlp": 0.01046266, "balance_loss_clip": 1.05017638, "balance_loss_mlp": 1.028265, "epoch": 0.2949646775890576, "flos": 23951483930880.0, "grad_norm": 1.7800977730713317, "language_loss": 0.8199898, "learning_rate": 3.3076331136843476e-06, "loss": 0.84157723, "num_input_tokens_seen": 105673570, "step": 4906, "time_per_iteration": 2.737182378768921 }, { "auxiliary_loss_clip": 0.01094663, "auxiliary_loss_mlp": 0.01040505, "balance_loss_clip": 1.04579425, "balance_loss_mlp": 1.02372003, "epoch": 0.29502480084172555, "flos": 22784315397120.0, "grad_norm": 2.8763815934933867, "language_loss": 0.87373984, "learning_rate": 3.3073384015057667e-06, "loss": 0.89509153, "num_input_tokens_seen": 105691940, "step": 4907, "time_per_iteration": 4.367825746536255 }, { "auxiliary_loss_clip": 0.01149393, "auxiliary_loss_mlp": 0.01043671, "balance_loss_clip": 1.05400407, "balance_loss_mlp": 1.02501488, "epoch": 0.2950849240943935, "flos": 19646656611840.0, "grad_norm": 2.047818146937445, "language_loss": 0.81910521, "learning_rate": 3.307043639752782e-06, "loss": 0.84103584, "num_input_tokens_seen": 105709825, "step": 4908, "time_per_iteration": 2.578582525253296 }, { "auxiliary_loss_clip": 0.01055582, "auxiliary_loss_mlp": 0.01003419, "balance_loss_clip": 1.02453518, "balance_loss_mlp": 1.00138056, "epoch": 0.2951450473470615, "flos": 71002829260800.0, "grad_norm": 0.7982723827999523, "language_loss": 0.57287854, "learning_rate": 3.3067488284365728e-06, "loss": 0.59346855, "num_input_tokens_seen": 105766880, "step": 4909, "time_per_iteration": 4.640491247177124 }, { "auxiliary_loss_clip": 0.01135445, "auxiliary_loss_mlp": 0.00774301, "balance_loss_clip": 1.05580318, "balance_loss_mlp": 1.00097156, "epoch": 0.29520517059972945, "flos": 22966310632320.0, "grad_norm": 1.756295161453336, "language_loss": 0.87018639, "learning_rate": 3.3064539675683163e-06, "loss": 0.88928384, "num_input_tokens_seen": 105786875, "step": 4910, "time_per_iteration": 2.642312526702881 }, { "auxiliary_loss_clip": 0.01131096, "auxiliary_loss_mlp": 0.0104303, "balance_loss_clip": 1.05359542, "balance_loss_mlp": 1.02744913, "epoch": 0.2952652938523974, "flos": 20485673470080.0, "grad_norm": 1.692596753939278, "language_loss": 0.73332304, "learning_rate": 3.3061590571591946e-06, "loss": 0.75506431, "num_input_tokens_seen": 105805315, "step": 4911, "time_per_iteration": 2.6130573749542236 }, { "auxiliary_loss_clip": 0.01132917, "auxiliary_loss_mlp": 0.01038473, "balance_loss_clip": 1.05330253, "balance_loss_mlp": 1.02193832, "epoch": 0.2953254171050654, "flos": 19646584784640.0, "grad_norm": 1.8009313294920104, "language_loss": 0.89653587, "learning_rate": 3.3058640972203904e-06, "loss": 0.91824973, "num_input_tokens_seen": 105825125, "step": 4912, "time_per_iteration": 2.660090684890747 }, { "auxiliary_loss_clip": 0.01114053, "auxiliary_loss_mlp": 0.010529, "balance_loss_clip": 1.0482899, "balance_loss_mlp": 1.03503084, "epoch": 0.29538554035773334, "flos": 22747973811840.0, "grad_norm": 1.3579869674800176, "language_loss": 0.83175462, "learning_rate": 3.3055690877630894e-06, "loss": 0.85342413, "num_input_tokens_seen": 105846085, "step": 4913, "time_per_iteration": 2.743364095687866 }, { "auxiliary_loss_clip": 0.01142468, "auxiliary_loss_mlp": 0.01043093, "balance_loss_clip": 1.04977608, "balance_loss_mlp": 1.02690446, "epoch": 0.2954456636104013, "flos": 21871861182720.0, "grad_norm": 1.9704695859403116, "language_loss": 0.76919919, "learning_rate": 3.3052740287984765e-06, "loss": 0.79105484, "num_input_tokens_seen": 105865400, "step": 4914, "time_per_iteration": 2.6778385639190674 }, { "auxiliary_loss_clip": 0.01121315, "auxiliary_loss_mlp": 0.01045386, "balance_loss_clip": 1.05064511, "balance_loss_mlp": 1.02818418, "epoch": 0.2955057868630693, "flos": 40442560871040.0, "grad_norm": 1.678810736285401, "language_loss": 0.81829619, "learning_rate": 3.3049789203377424e-06, "loss": 0.8399632, "num_input_tokens_seen": 105887920, "step": 4915, "time_per_iteration": 2.9347212314605713 }, { "auxiliary_loss_clip": 0.01068117, "auxiliary_loss_mlp": 0.01044435, "balance_loss_clip": 1.04405856, "balance_loss_mlp": 1.02722168, "epoch": 0.29556591011573724, "flos": 22564506119040.0, "grad_norm": 2.129336551193515, "language_loss": 0.84701812, "learning_rate": 3.3046837623920772e-06, "loss": 0.86814368, "num_input_tokens_seen": 105904035, "step": 4916, "time_per_iteration": 2.9183273315429688 }, { "auxiliary_loss_clip": 0.01125851, "auxiliary_loss_mlp": 0.01036694, "balance_loss_clip": 1.04655123, "balance_loss_mlp": 1.01975429, "epoch": 0.2956260333684052, "flos": 22089300163200.0, "grad_norm": 2.1082729468541683, "language_loss": 0.69490808, "learning_rate": 3.3043885549726723e-06, "loss": 0.71653348, "num_input_tokens_seen": 105922685, "step": 4917, "time_per_iteration": 2.7400357723236084 }, { "auxiliary_loss_clip": 0.01123659, "auxiliary_loss_mlp": 0.01038633, "balance_loss_clip": 1.05140972, "balance_loss_mlp": 1.02214622, "epoch": 0.2956861566210732, "flos": 16435488643200.0, "grad_norm": 2.699189623646437, "language_loss": 0.91076934, "learning_rate": 3.3040932980907226e-06, "loss": 0.93239224, "num_input_tokens_seen": 105940425, "step": 4918, "time_per_iteration": 2.7343270778656006 }, { "auxiliary_loss_clip": 0.01147937, "auxiliary_loss_mlp": 0.01043258, "balance_loss_clip": 1.0551039, "balance_loss_mlp": 1.02629495, "epoch": 0.2957462798737412, "flos": 25812087500160.0, "grad_norm": 1.9388581576792214, "language_loss": 0.72399175, "learning_rate": 3.303797991757425e-06, "loss": 0.74590373, "num_input_tokens_seen": 105960550, "step": 4919, "time_per_iteration": 2.718583822250366 }, { "auxiliary_loss_clip": 0.01119627, "auxiliary_loss_mlp": 0.01045651, "balance_loss_clip": 1.04843163, "balance_loss_mlp": 1.02838945, "epoch": 0.29580640312640916, "flos": 16690849407360.0, "grad_norm": 1.8826298231205452, "language_loss": 0.75919485, "learning_rate": 3.3035026359839763e-06, "loss": 0.78084767, "num_input_tokens_seen": 105978820, "step": 4920, "time_per_iteration": 2.7425734996795654 }, { "auxiliary_loss_clip": 0.01121739, "auxiliary_loss_mlp": 0.01052293, "balance_loss_clip": 1.05511427, "balance_loss_mlp": 1.03449547, "epoch": 0.2958665263790771, "flos": 23945594100480.0, "grad_norm": 5.307541834842734, "language_loss": 0.69020098, "learning_rate": 3.3032072307815774e-06, "loss": 0.71194124, "num_input_tokens_seen": 105997545, "step": 4921, "time_per_iteration": 2.7755305767059326 }, { "auxiliary_loss_clip": 0.01120164, "auxiliary_loss_mlp": 0.01043, "balance_loss_clip": 1.05075121, "balance_loss_mlp": 1.02453458, "epoch": 0.2959266496317451, "flos": 18478410670080.0, "grad_norm": 1.8488664920888758, "language_loss": 0.7462194, "learning_rate": 3.3029117761614298e-06, "loss": 0.767851, "num_input_tokens_seen": 106015320, "step": 4922, "time_per_iteration": 2.740687131881714 }, { "auxiliary_loss_clip": 0.01152013, "auxiliary_loss_mlp": 0.00775382, "balance_loss_clip": 1.05429566, "balance_loss_mlp": 1.00129843, "epoch": 0.29598677288441305, "flos": 25957489754880.0, "grad_norm": 1.7662799143188246, "language_loss": 0.77148855, "learning_rate": 3.302616272134737e-06, "loss": 0.79076254, "num_input_tokens_seen": 106034555, "step": 4923, "time_per_iteration": 2.664875030517578 }, { "auxiliary_loss_clip": 0.01117655, "auxiliary_loss_mlp": 0.01042537, "balance_loss_clip": 1.05065989, "balance_loss_mlp": 1.0247035, "epoch": 0.296046896137081, "flos": 25155999630720.0, "grad_norm": 1.7775190737024398, "language_loss": 0.86232758, "learning_rate": 3.3023207187127042e-06, "loss": 0.88392955, "num_input_tokens_seen": 106054200, "step": 4924, "time_per_iteration": 2.7413501739501953 }, { "auxiliary_loss_clip": 0.01132544, "auxiliary_loss_mlp": 0.01038356, "balance_loss_clip": 1.05098939, "balance_loss_mlp": 1.02114248, "epoch": 0.296107019389749, "flos": 21761148487680.0, "grad_norm": 1.479657736715748, "language_loss": 0.82050943, "learning_rate": 3.3020251159065396e-06, "loss": 0.84221852, "num_input_tokens_seen": 106074700, "step": 4925, "time_per_iteration": 2.676556348800659 }, { "auxiliary_loss_clip": 0.01078547, "auxiliary_loss_mlp": 0.01051683, "balance_loss_clip": 1.04153097, "balance_loss_mlp": 1.03283572, "epoch": 0.29616714264241695, "flos": 17960039544960.0, "grad_norm": 2.5440905583969697, "language_loss": 0.86138272, "learning_rate": 3.301729463727452e-06, "loss": 0.88268495, "num_input_tokens_seen": 106091415, "step": 4926, "time_per_iteration": 2.675780773162842 }, { "auxiliary_loss_clip": 0.01108502, "auxiliary_loss_mlp": 0.01035423, "balance_loss_clip": 1.04910469, "balance_loss_mlp": 1.0193243, "epoch": 0.2962272658950849, "flos": 15012779777280.0, "grad_norm": 2.332235960138756, "language_loss": 0.85897464, "learning_rate": 3.3014337621866527e-06, "loss": 0.88041389, "num_input_tokens_seen": 106109135, "step": 4927, "time_per_iteration": 2.7407169342041016 }, { "auxiliary_loss_clip": 0.01131541, "auxiliary_loss_mlp": 0.01039363, "balance_loss_clip": 1.05158448, "balance_loss_mlp": 1.02312613, "epoch": 0.2962873891477529, "flos": 14720861946240.0, "grad_norm": 3.581765820174834, "language_loss": 0.80772752, "learning_rate": 3.3011380112953553e-06, "loss": 0.8294366, "num_input_tokens_seen": 106125750, "step": 4928, "time_per_iteration": 2.6719777584075928 }, { "auxiliary_loss_clip": 0.01123889, "auxiliary_loss_mlp": 0.01043191, "balance_loss_clip": 1.04852009, "balance_loss_mlp": 1.02346206, "epoch": 0.29634751240042084, "flos": 26723787528960.0, "grad_norm": 2.79065826833615, "language_loss": 0.7313869, "learning_rate": 3.300842211064773e-06, "loss": 0.75305772, "num_input_tokens_seen": 106142835, "step": 4929, "time_per_iteration": 2.75266695022583 }, { "auxiliary_loss_clip": 0.0112132, "auxiliary_loss_mlp": 0.01054118, "balance_loss_clip": 1.0495156, "balance_loss_mlp": 1.03481805, "epoch": 0.2964076356530888, "flos": 14571293713920.0, "grad_norm": 2.360375509218164, "language_loss": 0.71534413, "learning_rate": 3.3005463615061246e-06, "loss": 0.73709846, "num_input_tokens_seen": 106160680, "step": 4930, "time_per_iteration": 2.799149990081787 }, { "auxiliary_loss_clip": 0.01028509, "auxiliary_loss_mlp": 0.01003992, "balance_loss_clip": 1.03094876, "balance_loss_mlp": 1.00229919, "epoch": 0.29646775890575683, "flos": 63104315063040.0, "grad_norm": 0.8053244370028285, "language_loss": 0.6061247, "learning_rate": 3.3002504626306275e-06, "loss": 0.6264497, "num_input_tokens_seen": 106224415, "step": 4931, "time_per_iteration": 3.218900442123413 }, { "auxiliary_loss_clip": 0.01007041, "auxiliary_loss_mlp": 0.01005936, "balance_loss_clip": 1.02247667, "balance_loss_mlp": 1.00395727, "epoch": 0.2965278821584248, "flos": 63067686168960.0, "grad_norm": 0.7408573754586586, "language_loss": 0.52380091, "learning_rate": 3.2999545144495023e-06, "loss": 0.54393071, "num_input_tokens_seen": 106279140, "step": 4932, "time_per_iteration": 3.26432728767395 }, { "auxiliary_loss_clip": 0.01129633, "auxiliary_loss_mlp": 0.01042438, "balance_loss_clip": 1.04917526, "balance_loss_mlp": 1.02584457, "epoch": 0.29658800541109276, "flos": 23768734510080.0, "grad_norm": 2.012094119717185, "language_loss": 0.81540775, "learning_rate": 3.299658516973972e-06, "loss": 0.83712846, "num_input_tokens_seen": 106298190, "step": 4933, "time_per_iteration": 2.804293155670166 }, { "auxiliary_loss_clip": 0.01092845, "auxiliary_loss_mlp": 0.01036901, "balance_loss_clip": 1.04405773, "balance_loss_mlp": 1.01966333, "epoch": 0.2966481286637607, "flos": 23988543788160.0, "grad_norm": 1.916542141573101, "language_loss": 0.75165296, "learning_rate": 3.299362470215261e-06, "loss": 0.77295041, "num_input_tokens_seen": 106319065, "step": 4934, "time_per_iteration": 2.797697067260742 }, { "auxiliary_loss_clip": 0.01126398, "auxiliary_loss_mlp": 0.01047716, "balance_loss_clip": 1.04985118, "balance_loss_mlp": 1.03013301, "epoch": 0.2967082519164287, "flos": 17165157523200.0, "grad_norm": 1.8491505675561635, "language_loss": 0.62093496, "learning_rate": 3.299066374184594e-06, "loss": 0.64267612, "num_input_tokens_seen": 106338040, "step": 4935, "time_per_iteration": 2.6466407775878906 }, { "auxiliary_loss_clip": 0.01129018, "auxiliary_loss_mlp": 0.01041652, "balance_loss_clip": 1.05052114, "balance_loss_mlp": 1.02452123, "epoch": 0.29676837516909665, "flos": 29387712816000.0, "grad_norm": 1.4269626202910053, "language_loss": 0.79485404, "learning_rate": 3.2987702288932e-06, "loss": 0.81656075, "num_input_tokens_seen": 106358900, "step": 4936, "time_per_iteration": 2.7333009243011475 }, { "auxiliary_loss_clip": 0.01100808, "auxiliary_loss_mlp": 0.01048756, "balance_loss_clip": 1.04970682, "balance_loss_mlp": 1.03040934, "epoch": 0.2968284984217646, "flos": 34751222616960.0, "grad_norm": 1.5951903019521643, "language_loss": 0.73993498, "learning_rate": 3.298474034352309e-06, "loss": 0.76143062, "num_input_tokens_seen": 106381805, "step": 4937, "time_per_iteration": 2.853935718536377 }, { "auxiliary_loss_clip": 0.01094789, "auxiliary_loss_mlp": 0.01038743, "balance_loss_clip": 1.05060768, "balance_loss_mlp": 1.0209924, "epoch": 0.2968886216744326, "flos": 21544104556800.0, "grad_norm": 1.654578873057457, "language_loss": 0.78373563, "learning_rate": 3.2981777905731526e-06, "loss": 0.80507094, "num_input_tokens_seen": 106402365, "step": 4938, "time_per_iteration": 2.803147077560425 }, { "auxiliary_loss_clip": 0.0111878, "auxiliary_loss_mlp": 0.01048023, "balance_loss_clip": 1.05193913, "balance_loss_mlp": 1.02931857, "epoch": 0.29694874492710055, "flos": 12787323811200.0, "grad_norm": 2.4827377035181013, "language_loss": 0.76842266, "learning_rate": 3.297881497566964e-06, "loss": 0.79009068, "num_input_tokens_seen": 106419800, "step": 4939, "time_per_iteration": 2.8867270946502686 }, { "auxiliary_loss_clip": 0.0111051, "auxiliary_loss_mlp": 0.01041172, "balance_loss_clip": 1.04666841, "balance_loss_mlp": 1.02361226, "epoch": 0.2970088681797685, "flos": 24569973239040.0, "grad_norm": 1.8055035581570296, "language_loss": 0.78354549, "learning_rate": 3.297585155344979e-06, "loss": 0.80506229, "num_input_tokens_seen": 106440300, "step": 4940, "time_per_iteration": 2.783046245574951 }, { "auxiliary_loss_clip": 0.01117762, "auxiliary_loss_mlp": 0.01037936, "balance_loss_clip": 1.0486958, "balance_loss_mlp": 1.01876736, "epoch": 0.2970689914324365, "flos": 23659171050240.0, "grad_norm": 1.6305550110852276, "language_loss": 0.75628781, "learning_rate": 3.297288763918435e-06, "loss": 0.77784479, "num_input_tokens_seen": 106460035, "step": 4941, "time_per_iteration": 2.74379825592041 }, { "auxiliary_loss_clip": 0.01138083, "auxiliary_loss_mlp": 0.01051629, "balance_loss_clip": 1.05272233, "balance_loss_mlp": 1.03276968, "epoch": 0.29712911468510445, "flos": 39670301439360.0, "grad_norm": 2.3053326725865313, "language_loss": 0.74158287, "learning_rate": 3.2969923232985712e-06, "loss": 0.76347995, "num_input_tokens_seen": 106481095, "step": 4942, "time_per_iteration": 4.468350410461426 }, { "auxiliary_loss_clip": 0.01111068, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.05172181, "balance_loss_mlp": 1.02589595, "epoch": 0.2971892379377724, "flos": 26395312631040.0, "grad_norm": 2.42728921351593, "language_loss": 0.702492, "learning_rate": 3.2966958334966287e-06, "loss": 0.72404563, "num_input_tokens_seen": 106501590, "step": 4943, "time_per_iteration": 4.2555251121521 }, { "auxiliary_loss_clip": 0.01124177, "auxiliary_loss_mlp": 0.01041442, "balance_loss_clip": 1.04988825, "balance_loss_mlp": 1.02360821, "epoch": 0.2972493611904404, "flos": 17603195880960.0, "grad_norm": 2.221197725988377, "language_loss": 0.795506, "learning_rate": 3.2963992945238497e-06, "loss": 0.81716216, "num_input_tokens_seen": 106519430, "step": 4944, "time_per_iteration": 2.6572201251983643 }, { "auxiliary_loss_clip": 0.0111705, "auxiliary_loss_mlp": 0.01041351, "balance_loss_clip": 1.04914248, "balance_loss_mlp": 1.02521038, "epoch": 0.2973094844431084, "flos": 20412774817920.0, "grad_norm": 2.187472317578873, "language_loss": 0.83260202, "learning_rate": 3.2961027063914795e-06, "loss": 0.85418606, "num_input_tokens_seen": 106535870, "step": 4945, "time_per_iteration": 2.6700363159179688 }, { "auxiliary_loss_clip": 0.01090371, "auxiliary_loss_mlp": 0.01039575, "balance_loss_clip": 1.04623246, "balance_loss_mlp": 1.02256417, "epoch": 0.29736960769577636, "flos": 17493488766720.0, "grad_norm": 1.8830005833778707, "language_loss": 0.67067397, "learning_rate": 3.2958060691107654e-06, "loss": 0.69197345, "num_input_tokens_seen": 106553560, "step": 4946, "time_per_iteration": 4.29357385635376 }, { "auxiliary_loss_clip": 0.01127819, "auxiliary_loss_mlp": 0.00777134, "balance_loss_clip": 1.04997563, "balance_loss_mlp": 1.00115252, "epoch": 0.2974297309484443, "flos": 26103969417600.0, "grad_norm": 1.879721590970614, "language_loss": 0.73877805, "learning_rate": 3.2955093826929547e-06, "loss": 0.75782764, "num_input_tokens_seen": 106574115, "step": 4947, "time_per_iteration": 2.657038450241089 }, { "auxiliary_loss_clip": 0.01109701, "auxiliary_loss_mlp": 0.01045546, "balance_loss_clip": 1.04896843, "balance_loss_mlp": 1.02705622, "epoch": 0.2974898542011123, "flos": 25666433850240.0, "grad_norm": 2.0989098852090633, "language_loss": 0.73522758, "learning_rate": 3.2952126471492985e-06, "loss": 0.75678003, "num_input_tokens_seen": 106593070, "step": 4948, "time_per_iteration": 4.4359636306762695 }, { "auxiliary_loss_clip": 0.01139863, "auxiliary_loss_mlp": 0.01040301, "balance_loss_clip": 1.04885721, "balance_loss_mlp": 1.02332592, "epoch": 0.29754997745378026, "flos": 18661339658880.0, "grad_norm": 2.06615582769113, "language_loss": 0.8397494, "learning_rate": 3.2949158624910497e-06, "loss": 0.86155105, "num_input_tokens_seen": 106610695, "step": 4949, "time_per_iteration": 2.6052157878875732 }, { "auxiliary_loss_clip": 0.01128522, "auxiliary_loss_mlp": 0.01041578, "balance_loss_clip": 1.04901218, "balance_loss_mlp": 1.02459633, "epoch": 0.2976101007064482, "flos": 22274599449600.0, "grad_norm": 2.2184783420455814, "language_loss": 0.71360326, "learning_rate": 3.2946190287294603e-06, "loss": 0.73530424, "num_input_tokens_seen": 106631300, "step": 4950, "time_per_iteration": 2.678953170776367 }, { "auxiliary_loss_clip": 0.01095366, "auxiliary_loss_mlp": 0.01039981, "balance_loss_clip": 1.04944646, "balance_loss_mlp": 1.0239712, "epoch": 0.2976702239591162, "flos": 21945657674880.0, "grad_norm": 3.098719098855731, "language_loss": 0.82645297, "learning_rate": 3.294322145875789e-06, "loss": 0.84780639, "num_input_tokens_seen": 106650065, "step": 4951, "time_per_iteration": 2.7566003799438477 }, { "auxiliary_loss_clip": 0.01118264, "auxiliary_loss_mlp": 0.01039186, "balance_loss_clip": 1.04655933, "balance_loss_mlp": 1.02190065, "epoch": 0.29773034721178415, "flos": 24637197542400.0, "grad_norm": 15.690000260498868, "language_loss": 0.74144769, "learning_rate": 3.2940252139412912e-06, "loss": 0.76302218, "num_input_tokens_seen": 106668230, "step": 4952, "time_per_iteration": 2.7019882202148438 }, { "auxiliary_loss_clip": 0.01063128, "auxiliary_loss_mlp": 0.01049349, "balance_loss_clip": 1.0433315, "balance_loss_mlp": 1.03133702, "epoch": 0.2977904704644521, "flos": 20557566541440.0, "grad_norm": 1.6701113978494808, "language_loss": 0.84251344, "learning_rate": 3.293728232937228e-06, "loss": 0.86363828, "num_input_tokens_seen": 106687785, "step": 4953, "time_per_iteration": 2.9622793197631836 }, { "auxiliary_loss_clip": 0.01120636, "auxiliary_loss_mlp": 0.01040588, "balance_loss_clip": 1.04966831, "balance_loss_mlp": 1.02428031, "epoch": 0.2978505937171201, "flos": 18916449027840.0, "grad_norm": 2.301918041259246, "language_loss": 0.74366152, "learning_rate": 3.2934312028748597e-06, "loss": 0.76527375, "num_input_tokens_seen": 106706875, "step": 4954, "time_per_iteration": 2.767455577850342 }, { "auxiliary_loss_clip": 0.01138563, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.04899216, "balance_loss_mlp": 1.02028275, "epoch": 0.29791071696978805, "flos": 19317750750720.0, "grad_norm": 2.0603039788066155, "language_loss": 0.75687683, "learning_rate": 3.293134123765452e-06, "loss": 0.77862525, "num_input_tokens_seen": 106725105, "step": 4955, "time_per_iteration": 2.638389825820923 }, { "auxiliary_loss_clip": 0.01094257, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.04760742, "balance_loss_mlp": 1.02505171, "epoch": 0.297970840222456, "flos": 18806813740800.0, "grad_norm": 2.358195616275362, "language_loss": 0.72600436, "learning_rate": 3.2928369956202684e-06, "loss": 0.74737054, "num_input_tokens_seen": 106744780, "step": 4956, "time_per_iteration": 2.777873992919922 }, { "auxiliary_loss_clip": 0.01134603, "auxiliary_loss_mlp": 0.0104754, "balance_loss_clip": 1.04957581, "balance_loss_mlp": 1.02930105, "epoch": 0.298030963475124, "flos": 22852760762880.0, "grad_norm": 2.0297274127598435, "language_loss": 0.79068756, "learning_rate": 3.2925398184505754e-06, "loss": 0.81250894, "num_input_tokens_seen": 106764670, "step": 4957, "time_per_iteration": 2.719581365585327 }, { "auxiliary_loss_clip": 0.01134843, "auxiliary_loss_mlp": 0.01041974, "balance_loss_clip": 1.05054235, "balance_loss_mlp": 1.02383018, "epoch": 0.298091086727792, "flos": 21868485304320.0, "grad_norm": 1.706880580606115, "language_loss": 0.70570725, "learning_rate": 3.2922425922676437e-06, "loss": 0.7274754, "num_input_tokens_seen": 106783695, "step": 4958, "time_per_iteration": 2.613697052001953 }, { "auxiliary_loss_clip": 0.01108077, "auxiliary_loss_mlp": 0.0104267, "balance_loss_clip": 1.05166888, "balance_loss_mlp": 1.0253129, "epoch": 0.29815120998045996, "flos": 21175014355200.0, "grad_norm": 1.5383051389102413, "language_loss": 0.78736448, "learning_rate": 3.291945317082743e-06, "loss": 0.80887192, "num_input_tokens_seen": 106803150, "step": 4959, "time_per_iteration": 2.751455545425415 }, { "auxiliary_loss_clip": 0.01129828, "auxiliary_loss_mlp": 0.01045919, "balance_loss_clip": 1.04906321, "balance_loss_mlp": 1.0290029, "epoch": 0.29821133323312793, "flos": 19896271200000.0, "grad_norm": 1.6624120752671379, "language_loss": 0.79747117, "learning_rate": 3.291647992907147e-06, "loss": 0.81922865, "num_input_tokens_seen": 106820705, "step": 4960, "time_per_iteration": 2.6345505714416504 }, { "auxiliary_loss_clip": 0.01110987, "auxiliary_loss_mlp": 0.01052912, "balance_loss_clip": 1.04863763, "balance_loss_mlp": 1.03449416, "epoch": 0.2982714564857959, "flos": 12750766744320.0, "grad_norm": 2.376132196895137, "language_loss": 0.73364639, "learning_rate": 3.291350619752129e-06, "loss": 0.75528538, "num_input_tokens_seen": 106837335, "step": 4961, "time_per_iteration": 2.725008010864258 }, { "auxiliary_loss_clip": 0.01130001, "auxiliary_loss_mlp": 0.0104294, "balance_loss_clip": 1.04824948, "balance_loss_mlp": 1.02640533, "epoch": 0.29833157973846386, "flos": 22271905929600.0, "grad_norm": 2.036560430862295, "language_loss": 0.62106621, "learning_rate": 3.291053197628967e-06, "loss": 0.64279556, "num_input_tokens_seen": 106856250, "step": 4962, "time_per_iteration": 2.690870523452759 }, { "auxiliary_loss_clip": 0.01128362, "auxiliary_loss_mlp": 0.01051341, "balance_loss_clip": 1.05034256, "balance_loss_mlp": 1.03310251, "epoch": 0.2983917029911318, "flos": 15372999319680.0, "grad_norm": 2.046461333274312, "language_loss": 0.82866591, "learning_rate": 3.2907557265489375e-06, "loss": 0.85046291, "num_input_tokens_seen": 106873370, "step": 4963, "time_per_iteration": 2.637723207473755 }, { "auxiliary_loss_clip": 0.01112844, "auxiliary_loss_mlp": 0.01044675, "balance_loss_clip": 1.05338502, "balance_loss_mlp": 1.0272826, "epoch": 0.2984518262437998, "flos": 15377632174080.0, "grad_norm": 2.580714695656121, "language_loss": 0.65933317, "learning_rate": 3.290458206523322e-06, "loss": 0.68090838, "num_input_tokens_seen": 106890330, "step": 4964, "time_per_iteration": 2.7210114002227783 }, { "auxiliary_loss_clip": 0.01128428, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.04990005, "balance_loss_mlp": 1.02345669, "epoch": 0.29851194949646775, "flos": 18108458542080.0, "grad_norm": 1.8191471944851214, "language_loss": 0.71093529, "learning_rate": 3.2901606375634015e-06, "loss": 0.73261172, "num_input_tokens_seen": 106909190, "step": 4965, "time_per_iteration": 2.7070064544677734 }, { "auxiliary_loss_clip": 0.01151396, "auxiliary_loss_mlp": 0.01056357, "balance_loss_clip": 1.05813003, "balance_loss_mlp": 1.03827357, "epoch": 0.2985720727491357, "flos": 22018233104640.0, "grad_norm": 2.164601494744612, "language_loss": 0.65952027, "learning_rate": 3.289863019680461e-06, "loss": 0.68159783, "num_input_tokens_seen": 106927825, "step": 4966, "time_per_iteration": 2.5820860862731934 }, { "auxiliary_loss_clip": 0.01148496, "auxiliary_loss_mlp": 0.01042183, "balance_loss_clip": 1.05610132, "balance_loss_mlp": 1.02496934, "epoch": 0.2986321960018037, "flos": 13041355772160.0, "grad_norm": 5.631297794621363, "language_loss": 0.73553479, "learning_rate": 3.289565352885785e-06, "loss": 0.75744158, "num_input_tokens_seen": 106943155, "step": 4967, "time_per_iteration": 2.558378219604492 }, { "auxiliary_loss_clip": 0.01110231, "auxiliary_loss_mlp": 0.01041561, "balance_loss_clip": 1.04339898, "balance_loss_mlp": 1.02440643, "epoch": 0.29869231925447165, "flos": 14465034305280.0, "grad_norm": 2.07351823246568, "language_loss": 0.71246195, "learning_rate": 3.2892676371906614e-06, "loss": 0.73397982, "num_input_tokens_seen": 106960295, "step": 4968, "time_per_iteration": 2.663163900375366 }, { "auxiliary_loss_clip": 0.01124763, "auxiliary_loss_mlp": 0.01043588, "balance_loss_clip": 1.04864979, "balance_loss_mlp": 1.02545607, "epoch": 0.2987524425071396, "flos": 31650228639360.0, "grad_norm": 2.159507035183752, "language_loss": 0.76744419, "learning_rate": 3.2889698726063805e-06, "loss": 0.78912771, "num_input_tokens_seen": 106982870, "step": 4969, "time_per_iteration": 2.729922294616699 }, { "auxiliary_loss_clip": 0.0114364, "auxiliary_loss_mlp": 0.01036255, "balance_loss_clip": 1.05239987, "balance_loss_mlp": 1.02054322, "epoch": 0.2988125657598076, "flos": 21433427775360.0, "grad_norm": 2.2724385668179936, "language_loss": 0.69836891, "learning_rate": 3.2886720591442327e-06, "loss": 0.72016788, "num_input_tokens_seen": 107002405, "step": 4970, "time_per_iteration": 2.6299381256103516 }, { "auxiliary_loss_clip": 0.01135061, "auxiliary_loss_mlp": 0.01048009, "balance_loss_clip": 1.05199289, "balance_loss_mlp": 1.02973413, "epoch": 0.2988726890124756, "flos": 18076965292800.0, "grad_norm": 2.0648779209654258, "language_loss": 0.85228848, "learning_rate": 3.2883741968155103e-06, "loss": 0.87411916, "num_input_tokens_seen": 107017310, "step": 4971, "time_per_iteration": 2.6508536338806152 }, { "auxiliary_loss_clip": 0.01112297, "auxiliary_loss_mlp": 0.01054091, "balance_loss_clip": 1.04895663, "balance_loss_mlp": 1.03510106, "epoch": 0.29893281226514357, "flos": 21755653706880.0, "grad_norm": 2.125047221260382, "language_loss": 0.79404521, "learning_rate": 3.2880762856315107e-06, "loss": 0.81570905, "num_input_tokens_seen": 107034645, "step": 4972, "time_per_iteration": 2.7924270629882812 }, { "auxiliary_loss_clip": 0.01145651, "auxiliary_loss_mlp": 0.01050789, "balance_loss_clip": 1.05367875, "balance_loss_mlp": 1.03427887, "epoch": 0.29899293551781153, "flos": 16836718538880.0, "grad_norm": 2.200462139835186, "language_loss": 0.85242772, "learning_rate": 3.2877783256035285e-06, "loss": 0.87439215, "num_input_tokens_seen": 107051125, "step": 4973, "time_per_iteration": 2.5249850749969482 }, { "auxiliary_loss_clip": 0.011108, "auxiliary_loss_mlp": 0.0104405, "balance_loss_clip": 1.04758012, "balance_loss_mlp": 1.02664554, "epoch": 0.2990530587704795, "flos": 11729215946880.0, "grad_norm": 2.0029664307268664, "language_loss": 0.77612329, "learning_rate": 3.287480316742863e-06, "loss": 0.79767179, "num_input_tokens_seen": 107068815, "step": 4974, "time_per_iteration": 2.6555633544921875 }, { "auxiliary_loss_clip": 0.01115732, "auxiliary_loss_mlp": 0.00779073, "balance_loss_clip": 1.04864824, "balance_loss_mlp": 1.00132942, "epoch": 0.29911318202314746, "flos": 28039877850240.0, "grad_norm": 1.735885031779611, "language_loss": 0.72557616, "learning_rate": 3.287182259060815e-06, "loss": 0.74452424, "num_input_tokens_seen": 107090420, "step": 4975, "time_per_iteration": 2.826773166656494 }, { "auxiliary_loss_clip": 0.01137332, "auxiliary_loss_mlp": 0.01043625, "balance_loss_clip": 1.05628741, "balance_loss_mlp": 1.02561235, "epoch": 0.2991733052758154, "flos": 18733555952640.0, "grad_norm": 2.282255680734404, "language_loss": 0.76357341, "learning_rate": 3.286884152568687e-06, "loss": 0.78538299, "num_input_tokens_seen": 107107255, "step": 4976, "time_per_iteration": 2.7506988048553467 }, { "auxiliary_loss_clip": 0.01130399, "auxiliary_loss_mlp": 0.01046525, "balance_loss_clip": 1.0515976, "balance_loss_mlp": 1.02988303, "epoch": 0.2992334285284834, "flos": 15559160532480.0, "grad_norm": 2.005019372487673, "language_loss": 0.86173046, "learning_rate": 3.2865859972777827e-06, "loss": 0.88349968, "num_input_tokens_seen": 107123840, "step": 4977, "time_per_iteration": 2.665029764175415 }, { "auxiliary_loss_clip": 0.01118345, "auxiliary_loss_mlp": 0.01041325, "balance_loss_clip": 1.05032945, "balance_loss_mlp": 1.02443314, "epoch": 0.29929355178115136, "flos": 21797561900160.0, "grad_norm": 1.7658271873172786, "language_loss": 0.68290305, "learning_rate": 3.2862877931994088e-06, "loss": 0.70449972, "num_input_tokens_seen": 107143475, "step": 4978, "time_per_iteration": 2.8401222229003906 }, { "auxiliary_loss_clip": 0.011259, "auxiliary_loss_mlp": 0.0104045, "balance_loss_clip": 1.05556107, "balance_loss_mlp": 1.02268767, "epoch": 0.2993536750338193, "flos": 21178533888000.0, "grad_norm": 2.254262103488659, "language_loss": 0.76281357, "learning_rate": 3.2859895403448726e-06, "loss": 0.78447711, "num_input_tokens_seen": 107161725, "step": 4979, "time_per_iteration": 2.7814600467681885 }, { "auxiliary_loss_clip": 0.01090165, "auxiliary_loss_mlp": 0.0104942, "balance_loss_clip": 1.04378402, "balance_loss_mlp": 1.03001285, "epoch": 0.2994137982864873, "flos": 32122130544000.0, "grad_norm": 2.1261514095664253, "language_loss": 0.68627954, "learning_rate": 3.285691238725484e-06, "loss": 0.70767546, "num_input_tokens_seen": 107183935, "step": 4980, "time_per_iteration": 2.891620635986328 }, { "auxiliary_loss_clip": 0.01130184, "auxiliary_loss_mlp": 0.00774942, "balance_loss_clip": 1.0525018, "balance_loss_mlp": 1.00121665, "epoch": 0.29947392153915525, "flos": 21105419754240.0, "grad_norm": 2.1372298066204114, "language_loss": 0.73153281, "learning_rate": 3.285392888352555e-06, "loss": 0.75058407, "num_input_tokens_seen": 107204285, "step": 4981, "time_per_iteration": 5.394481420516968 }, { "auxiliary_loss_clip": 0.01131964, "auxiliary_loss_mlp": 0.0103921, "balance_loss_clip": 1.0491364, "balance_loss_mlp": 1.02280653, "epoch": 0.2995340447918232, "flos": 21542632099200.0, "grad_norm": 1.6530173596529, "language_loss": 0.86516619, "learning_rate": 3.2850944892373987e-06, "loss": 0.88687789, "num_input_tokens_seen": 107225265, "step": 4982, "time_per_iteration": 4.269104480743408 }, { "auxiliary_loss_clip": 0.01122605, "auxiliary_loss_mlp": 0.01045235, "balance_loss_clip": 1.05186415, "balance_loss_mlp": 1.02632844, "epoch": 0.2995941680444912, "flos": 16725143917440.0, "grad_norm": 2.446225936700185, "language_loss": 0.86517423, "learning_rate": 3.2847960413913307e-06, "loss": 0.88685262, "num_input_tokens_seen": 107241335, "step": 4983, "time_per_iteration": 2.844748020172119 }, { "auxiliary_loss_clip": 0.01127565, "auxiliary_loss_mlp": 0.01041992, "balance_loss_clip": 1.05255556, "balance_loss_mlp": 1.02594662, "epoch": 0.2996542912971592, "flos": 20923496346240.0, "grad_norm": 2.024163877740881, "language_loss": 0.78712893, "learning_rate": 3.284497544825668e-06, "loss": 0.80882448, "num_input_tokens_seen": 107259375, "step": 4984, "time_per_iteration": 2.6945550441741943 }, { "auxiliary_loss_clip": 0.01110139, "auxiliary_loss_mlp": 0.01046002, "balance_loss_clip": 1.0492574, "balance_loss_mlp": 1.02761972, "epoch": 0.29971441454982717, "flos": 25079868754560.0, "grad_norm": 1.5529534411437271, "language_loss": 0.78736818, "learning_rate": 3.2841989995517303e-06, "loss": 0.8089295, "num_input_tokens_seen": 107279890, "step": 4985, "time_per_iteration": 2.8082690238952637 }, { "auxiliary_loss_clip": 0.01083189, "auxiliary_loss_mlp": 0.01050178, "balance_loss_clip": 1.04330277, "balance_loss_mlp": 1.02925658, "epoch": 0.29977453780249513, "flos": 52555911840000.0, "grad_norm": 2.2301347819864112, "language_loss": 0.72089684, "learning_rate": 3.283900405580837e-06, "loss": 0.74223053, "num_input_tokens_seen": 107303430, "step": 4986, "time_per_iteration": 4.54891562461853 }, { "auxiliary_loss_clip": 0.01119419, "auxiliary_loss_mlp": 0.01047564, "balance_loss_clip": 1.04838538, "balance_loss_mlp": 1.03007603, "epoch": 0.2998346610551631, "flos": 22237144542720.0, "grad_norm": 2.1453051702670787, "language_loss": 0.73143345, "learning_rate": 3.283601762924312e-06, "loss": 0.75310332, "num_input_tokens_seen": 107323700, "step": 4987, "time_per_iteration": 4.324375152587891 }, { "auxiliary_loss_clip": 0.01111213, "auxiliary_loss_mlp": 0.01039103, "balance_loss_clip": 1.04803324, "balance_loss_mlp": 1.0233314, "epoch": 0.29989478430783106, "flos": 16873203778560.0, "grad_norm": 2.095598578062247, "language_loss": 0.80221194, "learning_rate": 3.2833030715934793e-06, "loss": 0.82371509, "num_input_tokens_seen": 107341965, "step": 4988, "time_per_iteration": 2.772221565246582 }, { "auxiliary_loss_clip": 0.01114945, "auxiliary_loss_mlp": 0.00777889, "balance_loss_clip": 1.04905486, "balance_loss_mlp": 1.0013597, "epoch": 0.29995490756049903, "flos": 23768878164480.0, "grad_norm": 1.6966696236855432, "language_loss": 0.70858777, "learning_rate": 3.2830043315996658e-06, "loss": 0.72751617, "num_input_tokens_seen": 107362615, "step": 4989, "time_per_iteration": 2.7470130920410156 }, { "auxiliary_loss_clip": 0.0110827, "auxiliary_loss_mlp": 0.01046589, "balance_loss_clip": 1.0506041, "balance_loss_mlp": 1.02906489, "epoch": 0.300015030813167, "flos": 14465321614080.0, "grad_norm": 1.9545100728262668, "language_loss": 0.85589516, "learning_rate": 3.282705542954199e-06, "loss": 0.87744367, "num_input_tokens_seen": 107378980, "step": 4990, "time_per_iteration": 2.808276414871216 }, { "auxiliary_loss_clip": 0.01133569, "auxiliary_loss_mlp": 0.0103974, "balance_loss_clip": 1.05172086, "balance_loss_mlp": 1.02152538, "epoch": 0.30007515406583496, "flos": 25191982080000.0, "grad_norm": 1.8023870470649808, "language_loss": 0.67019355, "learning_rate": 3.28240670566841e-06, "loss": 0.69192666, "num_input_tokens_seen": 107397640, "step": 4991, "time_per_iteration": 2.7097268104553223 }, { "auxiliary_loss_clip": 0.0112021, "auxiliary_loss_mlp": 0.01041383, "balance_loss_clip": 1.04660511, "balance_loss_mlp": 1.02248883, "epoch": 0.3001352773185029, "flos": 19391188106880.0, "grad_norm": 1.684252307124257, "language_loss": 0.78640115, "learning_rate": 3.28210781975363e-06, "loss": 0.80801708, "num_input_tokens_seen": 107416020, "step": 4992, "time_per_iteration": 2.66925311088562 }, { "auxiliary_loss_clip": 0.01143243, "auxiliary_loss_mlp": 0.01041924, "balance_loss_clip": 1.05240428, "balance_loss_mlp": 1.02457952, "epoch": 0.3001954005711709, "flos": 21543853161600.0, "grad_norm": 2.3134173579188175, "language_loss": 0.82057947, "learning_rate": 3.281808885221193e-06, "loss": 0.84243113, "num_input_tokens_seen": 107436340, "step": 4993, "time_per_iteration": 2.613849639892578 }, { "auxiliary_loss_clip": 0.01096023, "auxiliary_loss_mlp": 0.01048917, "balance_loss_clip": 1.04667079, "balance_loss_mlp": 1.02997458, "epoch": 0.30025552382383885, "flos": 17384320356480.0, "grad_norm": 2.1042579138834197, "language_loss": 0.86142659, "learning_rate": 3.2815099020824345e-06, "loss": 0.88287598, "num_input_tokens_seen": 107454585, "step": 4994, "time_per_iteration": 2.703126907348633 }, { "auxiliary_loss_clip": 0.01118329, "auxiliary_loss_mlp": 0.01041975, "balance_loss_clip": 1.05592799, "balance_loss_mlp": 1.02504694, "epoch": 0.3003156470765068, "flos": 29533330552320.0, "grad_norm": 1.5905866784601752, "language_loss": 0.80834931, "learning_rate": 3.2812108703486924e-06, "loss": 0.82995236, "num_input_tokens_seen": 107477180, "step": 4995, "time_per_iteration": 2.8100333213806152 }, { "auxiliary_loss_clip": 0.01117939, "auxiliary_loss_mlp": 0.01043612, "balance_loss_clip": 1.05073023, "balance_loss_mlp": 1.02623129, "epoch": 0.3003757703291748, "flos": 43646402465280.0, "grad_norm": 1.9490007813217745, "language_loss": 0.67086798, "learning_rate": 3.2809117900313055e-06, "loss": 0.69248348, "num_input_tokens_seen": 107500250, "step": 4996, "time_per_iteration": 2.989062786102295 }, { "auxiliary_loss_clip": 0.01114657, "auxiliary_loss_mlp": 0.01042055, "balance_loss_clip": 1.04888701, "balance_loss_mlp": 1.02449584, "epoch": 0.30043589358184275, "flos": 22528380015360.0, "grad_norm": 4.4692930536610245, "language_loss": 0.75825363, "learning_rate": 3.280612661141615e-06, "loss": 0.7798208, "num_input_tokens_seen": 107520070, "step": 4997, "time_per_iteration": 2.733402967453003 }, { "auxiliary_loss_clip": 0.01131118, "auxiliary_loss_mlp": 0.0104737, "balance_loss_clip": 1.05176449, "balance_loss_mlp": 1.03149128, "epoch": 0.30049601683451077, "flos": 20995892208000.0, "grad_norm": 2.0588160995259197, "language_loss": 0.78425241, "learning_rate": 3.2803134836909646e-06, "loss": 0.80603731, "num_input_tokens_seen": 107539285, "step": 4998, "time_per_iteration": 2.7973837852478027 }, { "auxiliary_loss_clip": 0.011392, "auxiliary_loss_mlp": 0.01044927, "balance_loss_clip": 1.05180395, "balance_loss_mlp": 1.0287745, "epoch": 0.30055614008717874, "flos": 23916004272000.0, "grad_norm": 18.871291300313036, "language_loss": 0.73622382, "learning_rate": 3.2800142576906985e-06, "loss": 0.7580651, "num_input_tokens_seen": 107560260, "step": 4999, "time_per_iteration": 2.7197916507720947 }, { "auxiliary_loss_clip": 0.01131684, "auxiliary_loss_mlp": 0.01044515, "balance_loss_clip": 1.05033612, "balance_loss_mlp": 1.02750361, "epoch": 0.3006162633398467, "flos": 19169798630400.0, "grad_norm": 1.6090337016392804, "language_loss": 0.75454789, "learning_rate": 3.2797149831521626e-06, "loss": 0.77630985, "num_input_tokens_seen": 107579260, "step": 5000, "time_per_iteration": 2.688054323196411 }, { "auxiliary_loss_clip": 0.01138443, "auxiliary_loss_mlp": 0.01041074, "balance_loss_clip": 1.0505259, "balance_loss_mlp": 1.02564812, "epoch": 0.30067638659251467, "flos": 14679241061760.0, "grad_norm": 1.7985326326547535, "language_loss": 0.81841409, "learning_rate": 3.2794156600867073e-06, "loss": 0.84020931, "num_input_tokens_seen": 107595245, "step": 5001, "time_per_iteration": 2.6519837379455566 }, { "auxiliary_loss_clip": 0.01128756, "auxiliary_loss_mlp": 0.01048602, "balance_loss_clip": 1.05139947, "balance_loss_mlp": 1.03068447, "epoch": 0.30073650984518263, "flos": 23368007404800.0, "grad_norm": 1.8684342377814658, "language_loss": 0.7999261, "learning_rate": 3.2791162885056815e-06, "loss": 0.82169974, "num_input_tokens_seen": 107613985, "step": 5002, "time_per_iteration": 2.6749327182769775 }, { "auxiliary_loss_clip": 0.01091983, "auxiliary_loss_mlp": 0.0104282, "balance_loss_clip": 1.04869151, "balance_loss_mlp": 1.02431834, "epoch": 0.3007966330978506, "flos": 22966633854720.0, "grad_norm": 1.9577039368374018, "language_loss": 0.70993537, "learning_rate": 3.2788168684204376e-06, "loss": 0.73128337, "num_input_tokens_seen": 107631435, "step": 5003, "time_per_iteration": 2.908494472503662 }, { "auxiliary_loss_clip": 0.01110546, "auxiliary_loss_mlp": 0.01043883, "balance_loss_clip": 1.05014396, "balance_loss_mlp": 1.02643037, "epoch": 0.30085675635051856, "flos": 27818452460160.0, "grad_norm": 1.956987555909332, "language_loss": 0.70556092, "learning_rate": 3.27851739984233e-06, "loss": 0.72710526, "num_input_tokens_seen": 107650530, "step": 5004, "time_per_iteration": 2.8064236640930176 }, { "auxiliary_loss_clip": 0.01119172, "auxiliary_loss_mlp": 0.01045143, "balance_loss_clip": 1.05067444, "balance_loss_mlp": 1.02800083, "epoch": 0.3009168796031865, "flos": 10882729059840.0, "grad_norm": 2.8453259041050805, "language_loss": 0.81459486, "learning_rate": 3.278217882782715e-06, "loss": 0.83623803, "num_input_tokens_seen": 107662240, "step": 5005, "time_per_iteration": 2.633951425552368 }, { "auxiliary_loss_clip": 0.01130639, "auxiliary_loss_mlp": 0.01043853, "balance_loss_clip": 1.0514015, "balance_loss_mlp": 1.02742577, "epoch": 0.3009770028558545, "flos": 23805399317760.0, "grad_norm": 3.7156546302240043, "language_loss": 0.74672973, "learning_rate": 3.2779183172529497e-06, "loss": 0.76847464, "num_input_tokens_seen": 107680330, "step": 5006, "time_per_iteration": 2.7556662559509277 }, { "auxiliary_loss_clip": 0.01101239, "auxiliary_loss_mlp": 0.00775371, "balance_loss_clip": 1.04850578, "balance_loss_mlp": 1.00104856, "epoch": 0.30103712610852246, "flos": 26468211283200.0, "grad_norm": 2.0504029481480153, "language_loss": 0.71090448, "learning_rate": 3.2776187032643932e-06, "loss": 0.72967064, "num_input_tokens_seen": 107700020, "step": 5007, "time_per_iteration": 2.83591365814209 }, { "auxiliary_loss_clip": 0.01129575, "auxiliary_loss_mlp": 0.01038114, "balance_loss_clip": 1.05173922, "balance_loss_mlp": 1.0206027, "epoch": 0.3010972493611904, "flos": 22856459863680.0, "grad_norm": 2.302333802055736, "language_loss": 0.76504552, "learning_rate": 3.2773190408284075e-06, "loss": 0.78672242, "num_input_tokens_seen": 107718575, "step": 5008, "time_per_iteration": 2.7624082565307617 }, { "auxiliary_loss_clip": 0.0112694, "auxiliary_loss_mlp": 0.01039735, "balance_loss_clip": 1.05119205, "balance_loss_mlp": 1.02284265, "epoch": 0.3011573726138584, "flos": 24053685102720.0, "grad_norm": 1.840633361886899, "language_loss": 0.84215975, "learning_rate": 3.2770193299563564e-06, "loss": 0.86382657, "num_input_tokens_seen": 107738635, "step": 5009, "time_per_iteration": 2.7053475379943848 }, { "auxiliary_loss_clip": 0.01135722, "auxiliary_loss_mlp": 0.0104281, "balance_loss_clip": 1.05079174, "balance_loss_mlp": 1.02389145, "epoch": 0.30121749586652635, "flos": 20259687052800.0, "grad_norm": 1.970244045667646, "language_loss": 0.83804011, "learning_rate": 3.276719570659604e-06, "loss": 0.85982549, "num_input_tokens_seen": 107753415, "step": 5010, "time_per_iteration": 2.677002429962158 }, { "auxiliary_loss_clip": 0.01108582, "auxiliary_loss_mlp": 0.01038214, "balance_loss_clip": 1.04942024, "balance_loss_mlp": 1.02294374, "epoch": 0.3012776191191944, "flos": 26943058103040.0, "grad_norm": 2.3216326772862246, "language_loss": 0.85401523, "learning_rate": 3.2764197629495176e-06, "loss": 0.87548327, "num_input_tokens_seen": 107773840, "step": 5011, "time_per_iteration": 2.807887077331543 }, { "auxiliary_loss_clip": 0.01119452, "auxiliary_loss_mlp": 0.01044648, "balance_loss_clip": 1.04522014, "balance_loss_mlp": 1.02680194, "epoch": 0.30133774237186234, "flos": 20412307941120.0, "grad_norm": 2.58081844210284, "language_loss": 0.72122502, "learning_rate": 3.2761199068374656e-06, "loss": 0.74286604, "num_input_tokens_seen": 107792020, "step": 5012, "time_per_iteration": 2.689375400543213 }, { "auxiliary_loss_clip": 0.01127162, "auxiliary_loss_mlp": 0.01042946, "balance_loss_clip": 1.04826403, "balance_loss_mlp": 1.02628016, "epoch": 0.3013978656245303, "flos": 19792453916160.0, "grad_norm": 2.871668468467944, "language_loss": 0.88278735, "learning_rate": 3.275820002334819e-06, "loss": 0.90448833, "num_input_tokens_seen": 107809595, "step": 5013, "time_per_iteration": 2.6482350826263428 }, { "auxiliary_loss_clip": 0.01110184, "auxiliary_loss_mlp": 0.01050326, "balance_loss_clip": 1.04318821, "balance_loss_mlp": 1.0286417, "epoch": 0.30145798887719827, "flos": 16249650652800.0, "grad_norm": 1.8756845710135603, "language_loss": 0.82593644, "learning_rate": 3.2755200494529496e-06, "loss": 0.84754151, "num_input_tokens_seen": 107827230, "step": 5014, "time_per_iteration": 2.6681008338928223 }, { "auxiliary_loss_clip": 0.01092673, "auxiliary_loss_mlp": 0.01047692, "balance_loss_clip": 1.04461288, "balance_loss_mlp": 1.03045392, "epoch": 0.30151811212986623, "flos": 24571733005440.0, "grad_norm": 1.7101695757694795, "language_loss": 0.68239003, "learning_rate": 3.2752200482032323e-06, "loss": 0.7037937, "num_input_tokens_seen": 107847195, "step": 5015, "time_per_iteration": 2.725411891937256 }, { "auxiliary_loss_clip": 0.01110447, "auxiliary_loss_mlp": 0.01043819, "balance_loss_clip": 1.0448432, "balance_loss_mlp": 1.02652168, "epoch": 0.3015782353825342, "flos": 21872076664320.0, "grad_norm": 2.2766913154728625, "language_loss": 0.74497074, "learning_rate": 3.2749199985970436e-06, "loss": 0.76651341, "num_input_tokens_seen": 107866420, "step": 5016, "time_per_iteration": 2.710721492767334 }, { "auxiliary_loss_clip": 0.01133464, "auxiliary_loss_mlp": 0.01041604, "balance_loss_clip": 1.05026031, "balance_loss_mlp": 1.02444994, "epoch": 0.30163835863520216, "flos": 28769331248640.0, "grad_norm": 1.7847015072033203, "language_loss": 0.65504754, "learning_rate": 3.2746199006457603e-06, "loss": 0.67679822, "num_input_tokens_seen": 107889090, "step": 5017, "time_per_iteration": 2.7239317893981934 }, { "auxiliary_loss_clip": 0.01091977, "auxiliary_loss_mlp": 0.01057247, "balance_loss_clip": 1.04233074, "balance_loss_mlp": 1.03813791, "epoch": 0.30169848188787013, "flos": 22966202891520.0, "grad_norm": 2.1696927992492783, "language_loss": 0.68739498, "learning_rate": 3.2743197543607628e-06, "loss": 0.70888722, "num_input_tokens_seen": 107907520, "step": 5018, "time_per_iteration": 2.6655359268188477 }, { "auxiliary_loss_clip": 0.01135218, "auxiliary_loss_mlp": 0.01042787, "balance_loss_clip": 1.0482893, "balance_loss_mlp": 1.02783799, "epoch": 0.3017586051405381, "flos": 21835268202240.0, "grad_norm": 1.9457029488983892, "language_loss": 0.78853333, "learning_rate": 3.2740195597534327e-06, "loss": 0.8103134, "num_input_tokens_seen": 107925650, "step": 5019, "time_per_iteration": 2.669679641723633 }, { "auxiliary_loss_clip": 0.01112458, "auxiliary_loss_mlp": 0.01044161, "balance_loss_clip": 1.04863656, "balance_loss_mlp": 1.02766263, "epoch": 0.30181872839320606, "flos": 22160403135360.0, "grad_norm": 3.674249330665847, "language_loss": 0.70038712, "learning_rate": 3.2737193168351527e-06, "loss": 0.72195333, "num_input_tokens_seen": 107943975, "step": 5020, "time_per_iteration": 2.704000234603882 }, { "auxiliary_loss_clip": 0.01143422, "auxiliary_loss_mlp": 0.01049684, "balance_loss_clip": 1.05071819, "balance_loss_mlp": 1.03320909, "epoch": 0.301878851645874, "flos": 18114168804480.0, "grad_norm": 5.641410405732297, "language_loss": 0.78549969, "learning_rate": 3.2734190256173085e-06, "loss": 0.80743068, "num_input_tokens_seen": 107962950, "step": 5021, "time_per_iteration": 4.521278142929077 }, { "auxiliary_loss_clip": 0.01129372, "auxiliary_loss_mlp": 0.01031797, "balance_loss_clip": 1.04859924, "balance_loss_mlp": 1.01572752, "epoch": 0.301938974898542, "flos": 17602226213760.0, "grad_norm": 3.308202374048827, "language_loss": 0.75482392, "learning_rate": 3.2731186861112877e-06, "loss": 0.77643561, "num_input_tokens_seen": 107979700, "step": 5022, "time_per_iteration": 4.1478235721588135 }, { "auxiliary_loss_clip": 0.01141828, "auxiliary_loss_mlp": 0.01043797, "balance_loss_clip": 1.04905522, "balance_loss_mlp": 1.02676249, "epoch": 0.30199909815120995, "flos": 11181219079680.0, "grad_norm": 1.7715139184612991, "language_loss": 0.69534874, "learning_rate": 3.2728182983284793e-06, "loss": 0.71720505, "num_input_tokens_seen": 107996645, "step": 5023, "time_per_iteration": 2.582491636276245 }, { "auxiliary_loss_clip": 0.01112614, "auxiliary_loss_mlp": 0.01040881, "balance_loss_clip": 1.04434311, "balance_loss_mlp": 1.02471602, "epoch": 0.302059221403878, "flos": 21907843632000.0, "grad_norm": 4.128865002464027, "language_loss": 0.71400636, "learning_rate": 3.2725178622802724e-06, "loss": 0.73554134, "num_input_tokens_seen": 108015020, "step": 5024, "time_per_iteration": 2.6789708137512207 }, { "auxiliary_loss_clip": 0.01125475, "auxiliary_loss_mlp": 0.01051317, "balance_loss_clip": 1.04789031, "balance_loss_mlp": 1.03441346, "epoch": 0.30211934465654594, "flos": 26396390039040.0, "grad_norm": 2.5352325664815396, "language_loss": 0.73949707, "learning_rate": 3.272217377978061e-06, "loss": 0.76126498, "num_input_tokens_seen": 108036430, "step": 5025, "time_per_iteration": 2.7021281719207764 }, { "auxiliary_loss_clip": 0.01129438, "auxiliary_loss_mlp": 0.01049255, "balance_loss_clip": 1.05115628, "balance_loss_mlp": 1.03333473, "epoch": 0.3021794679092139, "flos": 23400470321280.0, "grad_norm": 1.5312912087399582, "language_loss": 0.67339373, "learning_rate": 3.2719168454332387e-06, "loss": 0.69518065, "num_input_tokens_seen": 108054250, "step": 5026, "time_per_iteration": 4.172817230224609 }, { "auxiliary_loss_clip": 0.01131398, "auxiliary_loss_mlp": 0.01045765, "balance_loss_clip": 1.05058789, "balance_loss_mlp": 1.02871835, "epoch": 0.30223959116188187, "flos": 20260979942400.0, "grad_norm": 1.8656003857402752, "language_loss": 0.84821522, "learning_rate": 3.2716162646572034e-06, "loss": 0.86998689, "num_input_tokens_seen": 108071495, "step": 5027, "time_per_iteration": 2.66186785697937 }, { "auxiliary_loss_clip": 0.01104085, "auxiliary_loss_mlp": 0.01045706, "balance_loss_clip": 1.04686451, "balance_loss_mlp": 1.03030431, "epoch": 0.30229971441454984, "flos": 26687840993280.0, "grad_norm": 1.633485895123786, "language_loss": 0.78574622, "learning_rate": 3.271315635661351e-06, "loss": 0.80724418, "num_input_tokens_seen": 108092135, "step": 5028, "time_per_iteration": 4.454678297042847 }, { "auxiliary_loss_clip": 0.01113383, "auxiliary_loss_mlp": 0.01048022, "balance_loss_clip": 1.04682207, "balance_loss_mlp": 1.03115392, "epoch": 0.3023598376672178, "flos": 34345323953280.0, "grad_norm": 1.9340935936746968, "language_loss": 0.77085543, "learning_rate": 3.2710149584570826e-06, "loss": 0.79246956, "num_input_tokens_seen": 108112945, "step": 5029, "time_per_iteration": 2.841707229614258 }, { "auxiliary_loss_clip": 0.01111921, "auxiliary_loss_mlp": 0.01048937, "balance_loss_clip": 1.04846191, "balance_loss_mlp": 1.02920818, "epoch": 0.30241996091988577, "flos": 23112143850240.0, "grad_norm": 2.1432001376374257, "language_loss": 0.8240397, "learning_rate": 3.2707142330557993e-06, "loss": 0.84564829, "num_input_tokens_seen": 108130325, "step": 5030, "time_per_iteration": 2.8557751178741455 }, { "auxiliary_loss_clip": 0.01090897, "auxiliary_loss_mlp": 0.00775419, "balance_loss_clip": 1.04519463, "balance_loss_mlp": 1.00112486, "epoch": 0.30248008417255373, "flos": 19390002958080.0, "grad_norm": 2.2374457582531098, "language_loss": 0.6987617, "learning_rate": 3.270413459468905e-06, "loss": 0.71742487, "num_input_tokens_seen": 108150300, "step": 5031, "time_per_iteration": 2.7827746868133545 }, { "auxiliary_loss_clip": 0.01121676, "auxiliary_loss_mlp": 0.01044463, "balance_loss_clip": 1.04549253, "balance_loss_mlp": 1.02800059, "epoch": 0.3025402074252217, "flos": 23769704177280.0, "grad_norm": 1.8685207024800563, "language_loss": 0.82324117, "learning_rate": 3.2701126377078047e-06, "loss": 0.84490258, "num_input_tokens_seen": 108170330, "step": 5032, "time_per_iteration": 2.6529927253723145 }, { "auxiliary_loss_clip": 0.01104945, "auxiliary_loss_mlp": 0.01059072, "balance_loss_clip": 1.05129266, "balance_loss_mlp": 1.03951025, "epoch": 0.30260033067788966, "flos": 25994118648960.0, "grad_norm": 2.130148669813867, "language_loss": 0.73156881, "learning_rate": 3.269811767783906e-06, "loss": 0.75320899, "num_input_tokens_seen": 108191265, "step": 5033, "time_per_iteration": 2.7259597778320312 }, { "auxiliary_loss_clip": 0.01124221, "auxiliary_loss_mlp": 0.01049397, "balance_loss_clip": 1.04687023, "balance_loss_mlp": 1.03221893, "epoch": 0.3026604539305576, "flos": 25374551932800.0, "grad_norm": 1.564237149834404, "language_loss": 0.74164939, "learning_rate": 3.2695108497086185e-06, "loss": 0.76338559, "num_input_tokens_seen": 108211615, "step": 5034, "time_per_iteration": 2.674745798110962 }, { "auxiliary_loss_clip": 0.01140313, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.04939198, "balance_loss_mlp": 1.02224064, "epoch": 0.3027205771832256, "flos": 25812733944960.0, "grad_norm": 1.8295549596836873, "language_loss": 0.72133434, "learning_rate": 3.269209883493352e-06, "loss": 0.74312872, "num_input_tokens_seen": 108231080, "step": 5035, "time_per_iteration": 2.6429855823516846 }, { "auxiliary_loss_clip": 0.01123118, "auxiliary_loss_mlp": 0.01038432, "balance_loss_clip": 1.04499483, "balance_loss_mlp": 1.02267289, "epoch": 0.30278070043589356, "flos": 27344539393920.0, "grad_norm": 2.468501372591198, "language_loss": 0.86918867, "learning_rate": 3.2689088691495196e-06, "loss": 0.89080417, "num_input_tokens_seen": 108251125, "step": 5036, "time_per_iteration": 2.6735007762908936 }, { "auxiliary_loss_clip": 0.01097642, "auxiliary_loss_mlp": 0.01051442, "balance_loss_clip": 1.04504728, "balance_loss_mlp": 1.0331912, "epoch": 0.3028408236885616, "flos": 24786227070720.0, "grad_norm": 2.859596651876304, "language_loss": 0.77406383, "learning_rate": 3.268607806688536e-06, "loss": 0.79555464, "num_input_tokens_seen": 108272545, "step": 5037, "time_per_iteration": 2.7311182022094727 }, { "auxiliary_loss_clip": 0.01102304, "auxiliary_loss_mlp": 0.01044604, "balance_loss_clip": 1.0462358, "balance_loss_mlp": 1.02683008, "epoch": 0.30290094694122954, "flos": 12932474670720.0, "grad_norm": 2.32450780354164, "language_loss": 0.77307165, "learning_rate": 3.268306696121816e-06, "loss": 0.79454064, "num_input_tokens_seen": 108289725, "step": 5038, "time_per_iteration": 2.677525043487549 }, { "auxiliary_loss_clip": 0.01113965, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.04819584, "balance_loss_mlp": 1.02067804, "epoch": 0.3029610701938975, "flos": 25916443488000.0, "grad_norm": 2.1234468188232976, "language_loss": 0.74140579, "learning_rate": 3.2680055374607804e-06, "loss": 0.76291645, "num_input_tokens_seen": 108310690, "step": 5039, "time_per_iteration": 2.7086853981018066 }, { "auxiliary_loss_clip": 0.01137739, "auxiliary_loss_mlp": 0.00774651, "balance_loss_clip": 1.05068994, "balance_loss_mlp": 1.00113058, "epoch": 0.3030211934465655, "flos": 21980993679360.0, "grad_norm": 2.3826017374700372, "language_loss": 0.79777801, "learning_rate": 3.267704330716847e-06, "loss": 0.81690192, "num_input_tokens_seen": 108328905, "step": 5040, "time_per_iteration": 2.665175199508667 }, { "auxiliary_loss_clip": 0.01114198, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.04937124, "balance_loss_mlp": 1.02279687, "epoch": 0.30308131669923344, "flos": 20991977625600.0, "grad_norm": 1.7800027985776907, "language_loss": 0.81872481, "learning_rate": 3.267403075901438e-06, "loss": 0.84024912, "num_input_tokens_seen": 108346680, "step": 5041, "time_per_iteration": 2.6471712589263916 }, { "auxiliary_loss_clip": 0.01018002, "auxiliary_loss_mlp": 0.01004656, "balance_loss_clip": 1.0244385, "balance_loss_mlp": 1.00277221, "epoch": 0.3031414399519014, "flos": 60548875827840.0, "grad_norm": 0.7715538683836823, "language_loss": 0.59505904, "learning_rate": 3.267101773025978e-06, "loss": 0.61528552, "num_input_tokens_seen": 108413885, "step": 5042, "time_per_iteration": 3.3167309761047363 }, { "auxiliary_loss_clip": 0.0114486, "auxiliary_loss_mlp": 0.01036647, "balance_loss_clip": 1.05319929, "balance_loss_mlp": 1.01940918, "epoch": 0.30320156320456937, "flos": 21907664064000.0, "grad_norm": 1.838538817411587, "language_loss": 0.71149278, "learning_rate": 3.266800422101892e-06, "loss": 0.73330784, "num_input_tokens_seen": 108433640, "step": 5043, "time_per_iteration": 2.6266753673553467 }, { "auxiliary_loss_clip": 0.01095086, "auxiliary_loss_mlp": 0.01036293, "balance_loss_clip": 1.04519725, "balance_loss_mlp": 1.01948404, "epoch": 0.30326168645723733, "flos": 21652770176640.0, "grad_norm": 3.620919115388089, "language_loss": 0.69573802, "learning_rate": 3.266499023140606e-06, "loss": 0.71705186, "num_input_tokens_seen": 108452640, "step": 5044, "time_per_iteration": 2.7561492919921875 }, { "auxiliary_loss_clip": 0.01127659, "auxiliary_loss_mlp": 0.01039805, "balance_loss_clip": 1.05019724, "balance_loss_mlp": 1.02335382, "epoch": 0.3033218097099053, "flos": 21871286565120.0, "grad_norm": 1.3797061223764004, "language_loss": 0.77188826, "learning_rate": 3.2661975761535513e-06, "loss": 0.79356289, "num_input_tokens_seen": 108472470, "step": 5045, "time_per_iteration": 2.6529667377471924 }, { "auxiliary_loss_clip": 0.01141388, "auxiliary_loss_mlp": 0.00775246, "balance_loss_clip": 1.05165195, "balance_loss_mlp": 1.00136316, "epoch": 0.30338193296257326, "flos": 27089717333760.0, "grad_norm": 1.772786200303907, "language_loss": 0.72473782, "learning_rate": 3.2658960811521564e-06, "loss": 0.74390417, "num_input_tokens_seen": 108493025, "step": 5046, "time_per_iteration": 2.8433380126953125 }, { "auxiliary_loss_clip": 0.01131475, "auxiliary_loss_mlp": 0.01040342, "balance_loss_clip": 1.04979491, "balance_loss_mlp": 1.02119732, "epoch": 0.30344205621524123, "flos": 19534363718400.0, "grad_norm": 1.7729778222487513, "language_loss": 0.81406343, "learning_rate": 3.2655945381478564e-06, "loss": 0.83578163, "num_input_tokens_seen": 108513480, "step": 5047, "time_per_iteration": 2.6653506755828857 }, { "auxiliary_loss_clip": 0.01078955, "auxiliary_loss_mlp": 0.01042974, "balance_loss_clip": 1.04126537, "balance_loss_mlp": 1.02565265, "epoch": 0.3035021794679092, "flos": 23910976368000.0, "grad_norm": 2.0012909108595287, "language_loss": 0.7191782, "learning_rate": 3.265292947152084e-06, "loss": 0.74039751, "num_input_tokens_seen": 108533155, "step": 5048, "time_per_iteration": 2.7198410034179688 }, { "auxiliary_loss_clip": 0.01117557, "auxiliary_loss_mlp": 0.01037944, "balance_loss_clip": 1.04860258, "balance_loss_mlp": 1.02263796, "epoch": 0.30356230272057716, "flos": 16143606725760.0, "grad_norm": 1.6260333435769418, "language_loss": 0.75220919, "learning_rate": 3.2649913081762763e-06, "loss": 0.77376425, "num_input_tokens_seen": 108551900, "step": 5049, "time_per_iteration": 2.6649906635284424 }, { "auxiliary_loss_clip": 0.01131404, "auxiliary_loss_mlp": 0.01035526, "balance_loss_clip": 1.04947305, "balance_loss_mlp": 1.01907563, "epoch": 0.3036224259732452, "flos": 28914697589760.0, "grad_norm": 1.5855456549340856, "language_loss": 0.82088244, "learning_rate": 3.2646896212318717e-06, "loss": 0.84255171, "num_input_tokens_seen": 108574005, "step": 5050, "time_per_iteration": 2.657400131225586 }, { "auxiliary_loss_clip": 0.01106158, "auxiliary_loss_mlp": 0.0103828, "balance_loss_clip": 1.05031502, "balance_loss_mlp": 1.02079201, "epoch": 0.30368254922591315, "flos": 21105599322240.0, "grad_norm": 2.7844840544166436, "language_loss": 0.74196702, "learning_rate": 3.2643878863303106e-06, "loss": 0.7634114, "num_input_tokens_seen": 108592715, "step": 5051, "time_per_iteration": 2.8018569946289062 }, { "auxiliary_loss_clip": 0.01079332, "auxiliary_loss_mlp": 0.00775567, "balance_loss_clip": 1.04338145, "balance_loss_mlp": 1.00118661, "epoch": 0.3037426724785811, "flos": 23002293081600.0, "grad_norm": 1.6849730779493737, "language_loss": 0.76015687, "learning_rate": 3.264086103483033e-06, "loss": 0.77870589, "num_input_tokens_seen": 108611770, "step": 5052, "time_per_iteration": 2.9220657348632812 }, { "auxiliary_loss_clip": 0.01143047, "auxiliary_loss_mlp": 0.01043624, "balance_loss_clip": 1.0504849, "balance_loss_mlp": 1.02656555, "epoch": 0.3038027957312491, "flos": 15632705629440.0, "grad_norm": 2.421175308310746, "language_loss": 0.82370055, "learning_rate": 3.2637842727014836e-06, "loss": 0.84556723, "num_input_tokens_seen": 108629070, "step": 5053, "time_per_iteration": 2.5955326557159424 }, { "auxiliary_loss_clip": 0.01113702, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.0471338, "balance_loss_mlp": 1.02475214, "epoch": 0.30386291898391704, "flos": 12713994195840.0, "grad_norm": 1.8307418288785484, "language_loss": 0.70979112, "learning_rate": 3.2634823939971083e-06, "loss": 0.73134822, "num_input_tokens_seen": 108646315, "step": 5054, "time_per_iteration": 2.7001569271087646 }, { "auxiliary_loss_clip": 0.01140964, "auxiliary_loss_mlp": 0.01039805, "balance_loss_clip": 1.05088401, "balance_loss_mlp": 1.0225668, "epoch": 0.303923042236585, "flos": 26359437922560.0, "grad_norm": 2.314538095600907, "language_loss": 0.69049591, "learning_rate": 3.2631804673813545e-06, "loss": 0.71230358, "num_input_tokens_seen": 108665920, "step": 5055, "time_per_iteration": 2.6685287952423096 }, { "auxiliary_loss_clip": 0.01113325, "auxiliary_loss_mlp": 0.01036352, "balance_loss_clip": 1.04871488, "balance_loss_mlp": 1.01880479, "epoch": 0.30398316548925297, "flos": 19719232041600.0, "grad_norm": 1.959915959447654, "language_loss": 0.67298615, "learning_rate": 3.2628784928656707e-06, "loss": 0.69448292, "num_input_tokens_seen": 108683485, "step": 5056, "time_per_iteration": 2.6933648586273193 }, { "auxiliary_loss_clip": 0.01110454, "auxiliary_loss_mlp": 0.01043223, "balance_loss_clip": 1.04604077, "balance_loss_mlp": 1.02673686, "epoch": 0.30404328874192094, "flos": 24239846315520.0, "grad_norm": 1.7045430221851803, "language_loss": 0.82544303, "learning_rate": 3.262576470461507e-06, "loss": 0.84697986, "num_input_tokens_seen": 108702700, "step": 5057, "time_per_iteration": 2.740187406539917 }, { "auxiliary_loss_clip": 0.01115402, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.04719019, "balance_loss_mlp": 1.0222472, "epoch": 0.3041034119945889, "flos": 24498942094080.0, "grad_norm": 1.8459128585017135, "language_loss": 0.88849652, "learning_rate": 3.2622744001803176e-06, "loss": 0.91004193, "num_input_tokens_seen": 108721860, "step": 5058, "time_per_iteration": 2.7015340328216553 }, { "auxiliary_loss_clip": 0.01102971, "auxiliary_loss_mlp": 0.01047692, "balance_loss_clip": 1.04598641, "balance_loss_mlp": 1.03040063, "epoch": 0.30416353524725687, "flos": 28288881907200.0, "grad_norm": 7.837576661900421, "language_loss": 0.71809238, "learning_rate": 3.2619722820335564e-06, "loss": 0.73959899, "num_input_tokens_seen": 108743215, "step": 5059, "time_per_iteration": 2.7542827129364014 }, { "auxiliary_loss_clip": 0.01083101, "auxiliary_loss_mlp": 0.01042605, "balance_loss_clip": 1.04435182, "balance_loss_mlp": 1.02670228, "epoch": 0.30422365849992483, "flos": 23660392112640.0, "grad_norm": 2.424944175434462, "language_loss": 0.73316336, "learning_rate": 3.26167011603268e-06, "loss": 0.7544204, "num_input_tokens_seen": 108765505, "step": 5060, "time_per_iteration": 4.655209541320801 }, { "auxiliary_loss_clip": 0.01140365, "auxiliary_loss_mlp": 0.01038221, "balance_loss_clip": 1.05072367, "balance_loss_mlp": 1.02234221, "epoch": 0.3042837817525928, "flos": 22998773548800.0, "grad_norm": 2.6284704346086, "language_loss": 0.77279079, "learning_rate": 3.2613679021891463e-06, "loss": 0.79457664, "num_input_tokens_seen": 108783370, "step": 5061, "time_per_iteration": 4.1857099533081055 }, { "auxiliary_loss_clip": 0.01105214, "auxiliary_loss_mlp": 0.01039505, "balance_loss_clip": 1.05216312, "balance_loss_mlp": 1.02225542, "epoch": 0.30434390500526076, "flos": 22082332924800.0, "grad_norm": 1.9238999634605745, "language_loss": 0.81891274, "learning_rate": 3.261065640514415e-06, "loss": 0.84035993, "num_input_tokens_seen": 108797430, "step": 5062, "time_per_iteration": 2.7250373363494873 }, { "auxiliary_loss_clip": 0.01132809, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.04662633, "balance_loss_mlp": 1.02098203, "epoch": 0.3044040282579287, "flos": 25483504861440.0, "grad_norm": 1.8479376829176948, "language_loss": 0.74707627, "learning_rate": 3.2607633310199483e-06, "loss": 0.76876783, "num_input_tokens_seen": 108816945, "step": 5063, "time_per_iteration": 2.6387155055999756 }, { "auxiliary_loss_clip": 0.01126143, "auxiliary_loss_mlp": 0.00775405, "balance_loss_clip": 1.04923415, "balance_loss_mlp": 1.00135541, "epoch": 0.30446415151059675, "flos": 21945478106880.0, "grad_norm": 1.691336757602503, "language_loss": 0.84400523, "learning_rate": 3.26046097371721e-06, "loss": 0.86302078, "num_input_tokens_seen": 108836615, "step": 5064, "time_per_iteration": 2.645256519317627 }, { "auxiliary_loss_clip": 0.01125608, "auxiliary_loss_mlp": 0.01040172, "balance_loss_clip": 1.04725182, "balance_loss_mlp": 1.02311337, "epoch": 0.3045242747632647, "flos": 16435416816000.0, "grad_norm": 2.198572989748056, "language_loss": 0.76257896, "learning_rate": 3.2601585686176655e-06, "loss": 0.78423673, "num_input_tokens_seen": 108855165, "step": 5065, "time_per_iteration": 4.119553565979004 }, { "auxiliary_loss_clip": 0.01110206, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.04441273, "balance_loss_mlp": 1.0260098, "epoch": 0.3045843980159327, "flos": 31540341957120.0, "grad_norm": 1.985168773674731, "language_loss": 0.62328786, "learning_rate": 3.2598561157327814e-06, "loss": 0.64482433, "num_input_tokens_seen": 108874690, "step": 5066, "time_per_iteration": 4.380331516265869 }, { "auxiliary_loss_clip": 0.01112307, "auxiliary_loss_mlp": 0.0104907, "balance_loss_clip": 1.04790235, "balance_loss_mlp": 1.03186774, "epoch": 0.30464452126860064, "flos": 17853636481920.0, "grad_norm": 2.188592288059769, "language_loss": 0.83193344, "learning_rate": 3.2595536150740265e-06, "loss": 0.85354722, "num_input_tokens_seen": 108893140, "step": 5067, "time_per_iteration": 2.628598213195801 }, { "auxiliary_loss_clip": 0.01136833, "auxiliary_loss_mlp": 0.01045137, "balance_loss_clip": 1.04994464, "balance_loss_mlp": 1.02904344, "epoch": 0.3047046445212686, "flos": 20631398947200.0, "grad_norm": 4.883769852075586, "language_loss": 0.62878895, "learning_rate": 3.259251066652873e-06, "loss": 0.65060866, "num_input_tokens_seen": 108911880, "step": 5068, "time_per_iteration": 2.583193302154541 }, { "auxiliary_loss_clip": 0.01127244, "auxiliary_loss_mlp": 0.01039272, "balance_loss_clip": 1.04866779, "balance_loss_mlp": 1.02316117, "epoch": 0.3047647677739366, "flos": 21287594557440.0, "grad_norm": 4.297243307498397, "language_loss": 0.74780715, "learning_rate": 3.258948470480793e-06, "loss": 0.7694723, "num_input_tokens_seen": 108930440, "step": 5069, "time_per_iteration": 2.643608570098877 }, { "auxiliary_loss_clip": 0.01103787, "auxiliary_loss_mlp": 0.01045252, "balance_loss_clip": 1.04608154, "balance_loss_mlp": 1.02922475, "epoch": 0.30482489102660454, "flos": 20995928121600.0, "grad_norm": 1.9753352797934713, "language_loss": 0.75726902, "learning_rate": 3.258645826569261e-06, "loss": 0.77875942, "num_input_tokens_seen": 108949125, "step": 5070, "time_per_iteration": 2.715672016143799 }, { "auxiliary_loss_clip": 0.01140483, "auxiliary_loss_mlp": 0.0077507, "balance_loss_clip": 1.04843533, "balance_loss_mlp": 1.0012939, "epoch": 0.3048850142792725, "flos": 26290812988800.0, "grad_norm": 1.7281078039111346, "language_loss": 0.81636953, "learning_rate": 3.2583431349297527e-06, "loss": 0.83552504, "num_input_tokens_seen": 108972190, "step": 5071, "time_per_iteration": 2.635542869567871 }, { "auxiliary_loss_clip": 0.01108476, "auxiliary_loss_mlp": 0.01045674, "balance_loss_clip": 1.04286063, "balance_loss_mlp": 1.02776885, "epoch": 0.30494513753194047, "flos": 22346241125760.0, "grad_norm": 2.0085610287172173, "language_loss": 0.76208484, "learning_rate": 3.2580403955737467e-06, "loss": 0.78362632, "num_input_tokens_seen": 108990325, "step": 5072, "time_per_iteration": 2.6662180423736572 }, { "auxiliary_loss_clip": 0.01099158, "auxiliary_loss_mlp": 0.01044752, "balance_loss_clip": 1.04694605, "balance_loss_mlp": 1.02821743, "epoch": 0.30500526078460843, "flos": 19537667769600.0, "grad_norm": 1.8424983506970039, "language_loss": 0.70873296, "learning_rate": 3.257737608512723e-06, "loss": 0.7301721, "num_input_tokens_seen": 109009505, "step": 5073, "time_per_iteration": 2.815281867980957 }, { "auxiliary_loss_clip": 0.01133011, "auxiliary_loss_mlp": 0.01055026, "balance_loss_clip": 1.05032837, "balance_loss_mlp": 1.03757334, "epoch": 0.3050653840372764, "flos": 14465321614080.0, "grad_norm": 2.0666195830085434, "language_loss": 0.76370406, "learning_rate": 3.257434773758163e-06, "loss": 0.78558439, "num_input_tokens_seen": 109026350, "step": 5074, "time_per_iteration": 2.748568534851074 }, { "auxiliary_loss_clip": 0.01115721, "auxiliary_loss_mlp": 0.01037599, "balance_loss_clip": 1.04921389, "balance_loss_mlp": 1.02149391, "epoch": 0.30512550728994436, "flos": 24243796811520.0, "grad_norm": 1.8649350467458667, "language_loss": 0.74393201, "learning_rate": 3.25713189132155e-06, "loss": 0.76546526, "num_input_tokens_seen": 109044165, "step": 5075, "time_per_iteration": 2.7015154361724854 }, { "auxiliary_loss_clip": 0.01141745, "auxiliary_loss_mlp": 0.01047345, "balance_loss_clip": 1.0498178, "balance_loss_mlp": 1.02825916, "epoch": 0.30518563054261233, "flos": 16360542915840.0, "grad_norm": 2.030111139920667, "language_loss": 0.75904357, "learning_rate": 3.2568289612143703e-06, "loss": 0.78093445, "num_input_tokens_seen": 109060665, "step": 5076, "time_per_iteration": 2.5811965465545654 }, { "auxiliary_loss_clip": 0.01116901, "auxiliary_loss_mlp": 0.01040641, "balance_loss_clip": 1.04864156, "balance_loss_mlp": 1.02466679, "epoch": 0.30524575379528035, "flos": 21579584215680.0, "grad_norm": 1.6479970241835653, "language_loss": 0.79240596, "learning_rate": 3.25652598344811e-06, "loss": 0.81398141, "num_input_tokens_seen": 109080035, "step": 5077, "time_per_iteration": 2.680205821990967 }, { "auxiliary_loss_clip": 0.01087088, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 1.04356635, "balance_loss_mlp": 1.01881564, "epoch": 0.3053058770479483, "flos": 16545231671040.0, "grad_norm": 1.6765288024346336, "language_loss": 0.74525034, "learning_rate": 3.256222958034259e-06, "loss": 0.76645821, "num_input_tokens_seen": 109097385, "step": 5078, "time_per_iteration": 2.7247111797332764 }, { "auxiliary_loss_clip": 0.01085086, "auxiliary_loss_mlp": 0.01054049, "balance_loss_clip": 1.04356313, "balance_loss_mlp": 1.03728211, "epoch": 0.3053660003006163, "flos": 12312907954560.0, "grad_norm": 1.7442741256404064, "language_loss": 0.66648543, "learning_rate": 3.255919884984307e-06, "loss": 0.68787676, "num_input_tokens_seen": 109115495, "step": 5079, "time_per_iteration": 2.746490716934204 }, { "auxiliary_loss_clip": 0.01127155, "auxiliary_loss_mlp": 0.01040504, "balance_loss_clip": 1.04811811, "balance_loss_mlp": 1.0248282, "epoch": 0.30542612355328425, "flos": 23112287504640.0, "grad_norm": 2.3583709354228213, "language_loss": 0.79841697, "learning_rate": 3.2556167643097477e-06, "loss": 0.82009357, "num_input_tokens_seen": 109134235, "step": 5080, "time_per_iteration": 2.7156612873077393 }, { "auxiliary_loss_clip": 0.01124116, "auxiliary_loss_mlp": 0.00772863, "balance_loss_clip": 1.04919219, "balance_loss_mlp": 1.00125837, "epoch": 0.3054862468059522, "flos": 24389450461440.0, "grad_norm": 2.2636550763480074, "language_loss": 0.81280053, "learning_rate": 3.255313596022074e-06, "loss": 0.8317703, "num_input_tokens_seen": 109152760, "step": 5081, "time_per_iteration": 2.6763248443603516 }, { "auxiliary_loss_clip": 0.01120003, "auxiliary_loss_mlp": 0.01044443, "balance_loss_clip": 1.04644883, "balance_loss_mlp": 1.02843297, "epoch": 0.3055463700586202, "flos": 29386096704000.0, "grad_norm": 7.924214405919456, "language_loss": 0.71839154, "learning_rate": 3.255010380132783e-06, "loss": 0.74003601, "num_input_tokens_seen": 109173925, "step": 5082, "time_per_iteration": 2.7159903049468994 }, { "auxiliary_loss_clip": 0.0112721, "auxiliary_loss_mlp": 0.01043614, "balance_loss_clip": 1.04611564, "balance_loss_mlp": 1.02554226, "epoch": 0.30560649331128814, "flos": 25591775431680.0, "grad_norm": 2.25447896755926, "language_loss": 0.73108822, "learning_rate": 3.2547071166533736e-06, "loss": 0.75279647, "num_input_tokens_seen": 109192510, "step": 5083, "time_per_iteration": 2.646739959716797 }, { "auxiliary_loss_clip": 0.01107487, "auxiliary_loss_mlp": 0.00775151, "balance_loss_clip": 1.04263341, "balance_loss_mlp": 1.00127327, "epoch": 0.3056666165639561, "flos": 19128321400320.0, "grad_norm": 1.7470718607902291, "language_loss": 0.71378291, "learning_rate": 3.254403805595344e-06, "loss": 0.73260927, "num_input_tokens_seen": 109210885, "step": 5084, "time_per_iteration": 2.6846230030059814 }, { "auxiliary_loss_clip": 0.01099017, "auxiliary_loss_mlp": 0.01047221, "balance_loss_clip": 1.04366112, "balance_loss_mlp": 1.02929187, "epoch": 0.30572673981662407, "flos": 15523860441600.0, "grad_norm": 1.8852357422602322, "language_loss": 0.78966236, "learning_rate": 3.2541004469701962e-06, "loss": 0.81112474, "num_input_tokens_seen": 109229180, "step": 5085, "time_per_iteration": 2.7193636894226074 }, { "auxiliary_loss_clip": 0.01130512, "auxiliary_loss_mlp": 0.01034677, "balance_loss_clip": 1.04483652, "balance_loss_mlp": 1.01910806, "epoch": 0.30578686306929204, "flos": 21506541909120.0, "grad_norm": 1.9742516674355037, "language_loss": 0.78476739, "learning_rate": 3.2537970407894342e-06, "loss": 0.80641937, "num_input_tokens_seen": 109249510, "step": 5086, "time_per_iteration": 2.5860135555267334 }, { "auxiliary_loss_clip": 0.01103374, "auxiliary_loss_mlp": 0.01052848, "balance_loss_clip": 1.04314184, "balance_loss_mlp": 1.03509736, "epoch": 0.30584698632196, "flos": 20954271323520.0, "grad_norm": 1.8682002339545791, "language_loss": 0.76727784, "learning_rate": 3.253493587064563e-06, "loss": 0.78884006, "num_input_tokens_seen": 109268200, "step": 5087, "time_per_iteration": 2.732639789581299 }, { "auxiliary_loss_clip": 0.01125241, "auxiliary_loss_mlp": 0.01041401, "balance_loss_clip": 1.04509556, "balance_loss_mlp": 1.02450943, "epoch": 0.30590710957462797, "flos": 24681116897280.0, "grad_norm": 2.048016576932303, "language_loss": 0.72534674, "learning_rate": 3.2531900858070885e-06, "loss": 0.74701315, "num_input_tokens_seen": 109288370, "step": 5088, "time_per_iteration": 2.66654109954834 }, { "auxiliary_loss_clip": 0.01128516, "auxiliary_loss_mlp": 0.01043444, "balance_loss_clip": 1.04584277, "balance_loss_mlp": 1.02587295, "epoch": 0.30596723282729593, "flos": 17086907744640.0, "grad_norm": 2.359735204382993, "language_loss": 0.79327172, "learning_rate": 3.252886537028521e-06, "loss": 0.8149913, "num_input_tokens_seen": 109306730, "step": 5089, "time_per_iteration": 2.613231897354126 }, { "auxiliary_loss_clip": 0.01110444, "auxiliary_loss_mlp": 0.01041514, "balance_loss_clip": 1.04634953, "balance_loss_mlp": 1.02470577, "epoch": 0.30602735607996395, "flos": 22857106308480.0, "grad_norm": 1.8271327477144206, "language_loss": 0.77158219, "learning_rate": 3.2525829407403703e-06, "loss": 0.79310179, "num_input_tokens_seen": 109327360, "step": 5090, "time_per_iteration": 2.7469358444213867 }, { "auxiliary_loss_clip": 0.01116264, "auxiliary_loss_mlp": 0.01050158, "balance_loss_clip": 1.04506445, "balance_loss_mlp": 1.03317034, "epoch": 0.3060874793326319, "flos": 29861482227840.0, "grad_norm": 1.7853121536190235, "language_loss": 0.76108491, "learning_rate": 3.2522792969541488e-06, "loss": 0.78274912, "num_input_tokens_seen": 109348135, "step": 5091, "time_per_iteration": 2.7344727516174316 }, { "auxiliary_loss_clip": 0.01076722, "auxiliary_loss_mlp": 0.01049007, "balance_loss_clip": 1.04582906, "balance_loss_mlp": 1.02905178, "epoch": 0.3061476025852999, "flos": 20448577699200.0, "grad_norm": 1.9985396703734173, "language_loss": 0.71938324, "learning_rate": 3.2519756056813705e-06, "loss": 0.74064058, "num_input_tokens_seen": 109366220, "step": 5092, "time_per_iteration": 2.767212390899658 }, { "auxiliary_loss_clip": 0.01114871, "auxiliary_loss_mlp": 0.01040516, "balance_loss_clip": 1.04740167, "balance_loss_mlp": 1.0246855, "epoch": 0.30620772583796785, "flos": 19391475415680.0, "grad_norm": 3.231748461445431, "language_loss": 0.82655406, "learning_rate": 3.2516718669335522e-06, "loss": 0.84810787, "num_input_tokens_seen": 109385260, "step": 5093, "time_per_iteration": 2.705643892288208 }, { "auxiliary_loss_clip": 0.01136927, "auxiliary_loss_mlp": 0.00773786, "balance_loss_clip": 1.04842925, "balance_loss_mlp": 1.00142932, "epoch": 0.3062678490906358, "flos": 24024562151040.0, "grad_norm": 1.6185046249293755, "language_loss": 0.75340986, "learning_rate": 3.2513680807222114e-06, "loss": 0.77251703, "num_input_tokens_seen": 109405025, "step": 5094, "time_per_iteration": 2.6171963214874268 }, { "auxiliary_loss_clip": 0.01112613, "auxiliary_loss_mlp": 0.01042135, "balance_loss_clip": 1.04798305, "balance_loss_mlp": 1.02639914, "epoch": 0.3063279723433038, "flos": 19754639873280.0, "grad_norm": 2.1053112950674824, "language_loss": 0.75988996, "learning_rate": 3.251064247058868e-06, "loss": 0.7814374, "num_input_tokens_seen": 109422465, "step": 5095, "time_per_iteration": 2.7002673149108887 }, { "auxiliary_loss_clip": 0.0112272, "auxiliary_loss_mlp": 0.01043966, "balance_loss_clip": 1.04654729, "balance_loss_mlp": 1.0278492, "epoch": 0.30638809559597174, "flos": 22450022496000.0, "grad_norm": 8.237851994820396, "language_loss": 0.80608332, "learning_rate": 3.250760365955042e-06, "loss": 0.82775021, "num_input_tokens_seen": 109440575, "step": 5096, "time_per_iteration": 2.675551414489746 }, { "auxiliary_loss_clip": 0.01125431, "auxiliary_loss_mlp": 0.01036388, "balance_loss_clip": 1.04639602, "balance_loss_mlp": 1.02030659, "epoch": 0.3064482188486397, "flos": 17165157523200.0, "grad_norm": 3.1166257890970566, "language_loss": 0.81695235, "learning_rate": 3.250456437422258e-06, "loss": 0.83857059, "num_input_tokens_seen": 109459050, "step": 5097, "time_per_iteration": 2.6616358757019043 }, { "auxiliary_loss_clip": 0.01138165, "auxiliary_loss_mlp": 0.01042971, "balance_loss_clip": 1.04782009, "balance_loss_mlp": 1.02522099, "epoch": 0.3065083421013077, "flos": 23768483114880.0, "grad_norm": 2.1722798378639663, "language_loss": 0.78152639, "learning_rate": 3.250152461472041e-06, "loss": 0.80333775, "num_input_tokens_seen": 109475860, "step": 5098, "time_per_iteration": 2.581339120864868 }, { "auxiliary_loss_clip": 0.01093696, "auxiliary_loss_mlp": 0.01039814, "balance_loss_clip": 1.04763365, "balance_loss_mlp": 1.02302897, "epoch": 0.30656846535397564, "flos": 26431833784320.0, "grad_norm": 1.8342329708039284, "language_loss": 0.84488571, "learning_rate": 3.249848438115917e-06, "loss": 0.86622083, "num_input_tokens_seen": 109494760, "step": 5099, "time_per_iteration": 2.761580467224121 }, { "auxiliary_loss_clip": 0.0113763, "auxiliary_loss_mlp": 0.01044142, "balance_loss_clip": 1.04598331, "balance_loss_mlp": 1.02683902, "epoch": 0.3066285886066436, "flos": 26651786716800.0, "grad_norm": 1.7645297710058767, "language_loss": 0.85650218, "learning_rate": 3.2495443673654148e-06, "loss": 0.87831986, "num_input_tokens_seen": 109516480, "step": 5100, "time_per_iteration": 4.130753517150879 }, { "auxiliary_loss_clip": 0.01099546, "auxiliary_loss_mlp": 0.01040494, "balance_loss_clip": 1.04097986, "balance_loss_mlp": 1.02268374, "epoch": 0.30668871185931157, "flos": 15049947375360.0, "grad_norm": 1.8121599631247622, "language_loss": 0.78980827, "learning_rate": 3.249240249232065e-06, "loss": 0.81120867, "num_input_tokens_seen": 109534615, "step": 5101, "time_per_iteration": 4.324965000152588 }, { "auxiliary_loss_clip": 0.01102347, "auxiliary_loss_mlp": 0.01054476, "balance_loss_clip": 1.04654586, "balance_loss_mlp": 1.03549778, "epoch": 0.30674883511197953, "flos": 20082109190400.0, "grad_norm": 3.103169454759946, "language_loss": 0.8002606, "learning_rate": 3.2489360837273998e-06, "loss": 0.82182884, "num_input_tokens_seen": 109554040, "step": 5102, "time_per_iteration": 2.6799395084381104 }, { "auxiliary_loss_clip": 0.01142197, "auxiliary_loss_mlp": 0.01041215, "balance_loss_clip": 1.05097044, "balance_loss_mlp": 1.02254653, "epoch": 0.30680895836464755, "flos": 22893807029760.0, "grad_norm": 2.1213785434731416, "language_loss": 0.88774347, "learning_rate": 3.2486318708629532e-06, "loss": 0.90957761, "num_input_tokens_seen": 109574345, "step": 5103, "time_per_iteration": 2.65173077583313 }, { "auxiliary_loss_clip": 0.01117159, "auxiliary_loss_mlp": 0.01047865, "balance_loss_clip": 1.04379106, "balance_loss_mlp": 1.03051972, "epoch": 0.3068690816173155, "flos": 23696159080320.0, "grad_norm": 1.7904968866721789, "language_loss": 0.73977435, "learning_rate": 3.2483276106502607e-06, "loss": 0.7614246, "num_input_tokens_seen": 109593670, "step": 5104, "time_per_iteration": 4.15887975692749 }, { "auxiliary_loss_clip": 0.01124364, "auxiliary_loss_mlp": 0.00776702, "balance_loss_clip": 1.04378068, "balance_loss_mlp": 1.00128829, "epoch": 0.3069292048699835, "flos": 23551044134400.0, "grad_norm": 3.7241561762804496, "language_loss": 0.72777617, "learning_rate": 3.2480233031008605e-06, "loss": 0.74678683, "num_input_tokens_seen": 109613385, "step": 5105, "time_per_iteration": 2.657212972640991 }, { "auxiliary_loss_clip": 0.01112354, "auxiliary_loss_mlp": 0.01041782, "balance_loss_clip": 1.0451684, "balance_loss_mlp": 1.02401972, "epoch": 0.30698932812265145, "flos": 24531656405760.0, "grad_norm": 1.9297281358185925, "language_loss": 0.87290782, "learning_rate": 3.2477189482262916e-06, "loss": 0.89444917, "num_input_tokens_seen": 109632395, "step": 5106, "time_per_iteration": 4.409428119659424 }, { "auxiliary_loss_clip": 0.0110831, "auxiliary_loss_mlp": 0.01052851, "balance_loss_clip": 1.04540682, "balance_loss_mlp": 1.03390849, "epoch": 0.3070494513753194, "flos": 20996430912000.0, "grad_norm": 2.254355123120303, "language_loss": 0.71420276, "learning_rate": 3.2474145460380945e-06, "loss": 0.73581433, "num_input_tokens_seen": 109651380, "step": 5107, "time_per_iteration": 2.7320871353149414 }, { "auxiliary_loss_clip": 0.01101295, "auxiliary_loss_mlp": 0.0104767, "balance_loss_clip": 1.04618347, "balance_loss_mlp": 1.03034878, "epoch": 0.3071095746279874, "flos": 19025940660480.0, "grad_norm": 2.1230574515432705, "language_loss": 0.72282934, "learning_rate": 3.247110096547814e-06, "loss": 0.74431896, "num_input_tokens_seen": 109670240, "step": 5108, "time_per_iteration": 2.720196485519409 }, { "auxiliary_loss_clip": 0.01112658, "auxiliary_loss_mlp": 0.01040837, "balance_loss_clip": 1.04619241, "balance_loss_mlp": 1.02325416, "epoch": 0.30716969788065535, "flos": 21215521918080.0, "grad_norm": 3.0053852764205695, "language_loss": 0.8601433, "learning_rate": 3.2468055997669926e-06, "loss": 0.88167822, "num_input_tokens_seen": 109690810, "step": 5109, "time_per_iteration": 2.715580940246582 }, { "auxiliary_loss_clip": 0.01109383, "auxiliary_loss_mlp": 0.01036759, "balance_loss_clip": 1.04432368, "balance_loss_mlp": 1.02017736, "epoch": 0.3072298211333233, "flos": 25772765086080.0, "grad_norm": 1.7463183423202828, "language_loss": 0.67169911, "learning_rate": 3.2465010557071788e-06, "loss": 0.69316053, "num_input_tokens_seen": 109711145, "step": 5110, "time_per_iteration": 2.7133336067199707 }, { "auxiliary_loss_clip": 0.01126653, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.04854119, "balance_loss_mlp": 1.01736796, "epoch": 0.3072899443859913, "flos": 25848931875840.0, "grad_norm": 1.4548971516988844, "language_loss": 0.76673061, "learning_rate": 3.246196464379919e-06, "loss": 0.78833127, "num_input_tokens_seen": 109731425, "step": 5111, "time_per_iteration": 2.692505121231079 }, { "auxiliary_loss_clip": 0.01140411, "auxiliary_loss_mlp": 0.0103997, "balance_loss_clip": 1.04979658, "balance_loss_mlp": 1.02360249, "epoch": 0.30735006763865924, "flos": 25922800195200.0, "grad_norm": 3.7694679470365244, "language_loss": 0.67143333, "learning_rate": 3.245891825796765e-06, "loss": 0.69323719, "num_input_tokens_seen": 109752720, "step": 5112, "time_per_iteration": 2.6441125869750977 }, { "auxiliary_loss_clip": 0.01133822, "auxiliary_loss_mlp": 0.01044497, "balance_loss_clip": 1.05147326, "balance_loss_mlp": 1.02482784, "epoch": 0.3074101908913272, "flos": 30917004312960.0, "grad_norm": 2.062737517485213, "language_loss": 0.79524493, "learning_rate": 3.2455871399692678e-06, "loss": 0.81702805, "num_input_tokens_seen": 109772840, "step": 5113, "time_per_iteration": 2.7166647911071777 }, { "auxiliary_loss_clip": 0.01102438, "auxiliary_loss_mlp": 0.00774651, "balance_loss_clip": 1.04638815, "balance_loss_mlp": 1.00138378, "epoch": 0.30747031414399517, "flos": 18401058731520.0, "grad_norm": 2.08885217843665, "language_loss": 0.76926446, "learning_rate": 3.2452824069089815e-06, "loss": 0.78803539, "num_input_tokens_seen": 109790150, "step": 5114, "time_per_iteration": 2.6842217445373535 }, { "auxiliary_loss_clip": 0.01100955, "auxiliary_loss_mlp": 0.01034415, "balance_loss_clip": 1.0446732, "balance_loss_mlp": 1.01589036, "epoch": 0.30753043739666314, "flos": 22633166966400.0, "grad_norm": 2.179333764681939, "language_loss": 0.62607706, "learning_rate": 3.2449776266274623e-06, "loss": 0.64743078, "num_input_tokens_seen": 109807985, "step": 5115, "time_per_iteration": 2.7709848880767822 }, { "auxiliary_loss_clip": 0.0113067, "auxiliary_loss_mlp": 0.01041883, "balance_loss_clip": 1.04829907, "balance_loss_mlp": 1.02557516, "epoch": 0.3075905606493311, "flos": 27344072517120.0, "grad_norm": 2.4707888757665684, "language_loss": 0.82835108, "learning_rate": 3.2446727991362657e-06, "loss": 0.85007656, "num_input_tokens_seen": 109825920, "step": 5116, "time_per_iteration": 2.6891255378723145 }, { "auxiliary_loss_clip": 0.01115169, "auxiliary_loss_mlp": 0.01050095, "balance_loss_clip": 1.04928303, "balance_loss_mlp": 1.03291702, "epoch": 0.3076506839019991, "flos": 22090808534400.0, "grad_norm": 1.792550086960714, "language_loss": 0.75943851, "learning_rate": 3.244367924446952e-06, "loss": 0.78109109, "num_input_tokens_seen": 109846220, "step": 5117, "time_per_iteration": 2.6685919761657715 }, { "auxiliary_loss_clip": 0.01096356, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.04583359, "balance_loss_mlp": 1.02309084, "epoch": 0.3077108071546671, "flos": 21289533891840.0, "grad_norm": 2.509228810910763, "language_loss": 0.71450555, "learning_rate": 3.2440630025710826e-06, "loss": 0.7358911, "num_input_tokens_seen": 109863870, "step": 5118, "time_per_iteration": 2.7360472679138184 }, { "auxiliary_loss_clip": 0.0109679, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.05069757, "balance_loss_mlp": 1.02279758, "epoch": 0.30777093040733505, "flos": 21430985650560.0, "grad_norm": 1.6950758291291428, "language_loss": 0.74499059, "learning_rate": 3.243758033520219e-06, "loss": 0.76635897, "num_input_tokens_seen": 109883500, "step": 5119, "time_per_iteration": 2.7963552474975586 }, { "auxiliary_loss_clip": 0.01133391, "auxiliary_loss_mlp": 0.01054336, "balance_loss_clip": 1.05088997, "balance_loss_mlp": 1.03520322, "epoch": 0.307831053660003, "flos": 23149275534720.0, "grad_norm": 2.3083726349779785, "language_loss": 0.79968077, "learning_rate": 3.243453017305926e-06, "loss": 0.821558, "num_input_tokens_seen": 109904620, "step": 5120, "time_per_iteration": 2.7600536346435547 }, { "auxiliary_loss_clip": 0.01127117, "auxiliary_loss_mlp": 0.01045491, "balance_loss_clip": 1.04772663, "balance_loss_mlp": 1.02994657, "epoch": 0.307891176912671, "flos": 17019755268480.0, "grad_norm": 1.7119475154385397, "language_loss": 0.79864663, "learning_rate": 3.24314795393977e-06, "loss": 0.8203727, "num_input_tokens_seen": 109922275, "step": 5121, "time_per_iteration": 2.6204211711883545 }, { "auxiliary_loss_clip": 0.01105091, "auxiliary_loss_mlp": 0.01039616, "balance_loss_clip": 1.04669154, "balance_loss_mlp": 1.02292657, "epoch": 0.30795130016533895, "flos": 27705046245120.0, "grad_norm": 1.4682711249191758, "language_loss": 0.82526803, "learning_rate": 3.242842843433319e-06, "loss": 0.84671509, "num_input_tokens_seen": 109944265, "step": 5122, "time_per_iteration": 2.7210805416107178 }, { "auxiliary_loss_clip": 0.01052784, "auxiliary_loss_mlp": 0.01010188, "balance_loss_clip": 1.03048515, "balance_loss_mlp": 1.00826919, "epoch": 0.3080114234180069, "flos": 69058699591680.0, "grad_norm": 0.7449761063336078, "language_loss": 0.58609217, "learning_rate": 3.242537685798143e-06, "loss": 0.60672188, "num_input_tokens_seen": 110014160, "step": 5123, "time_per_iteration": 3.303093433380127 }, { "auxiliary_loss_clip": 0.01133855, "auxiliary_loss_mlp": 0.00776294, "balance_loss_clip": 1.04937184, "balance_loss_mlp": 1.00136161, "epoch": 0.3080715466706749, "flos": 24060221377920.0, "grad_norm": 1.5927838238117058, "language_loss": 0.83550704, "learning_rate": 3.242232481045813e-06, "loss": 0.85460854, "num_input_tokens_seen": 110034865, "step": 5124, "time_per_iteration": 2.7226438522338867 }, { "auxiliary_loss_clip": 0.01143185, "auxiliary_loss_mlp": 0.01038734, "balance_loss_clip": 1.05123234, "balance_loss_mlp": 1.02206898, "epoch": 0.30813166992334284, "flos": 25848680480640.0, "grad_norm": 2.0767599752543657, "language_loss": 0.79332423, "learning_rate": 3.2419272291879035e-06, "loss": 0.81514347, "num_input_tokens_seen": 110052930, "step": 5125, "time_per_iteration": 2.6514153480529785 }, { "auxiliary_loss_clip": 0.01125892, "auxiliary_loss_mlp": 0.01035278, "balance_loss_clip": 1.04636812, "balance_loss_mlp": 1.01694369, "epoch": 0.3081917931760108, "flos": 20449619193600.0, "grad_norm": 1.764828299724452, "language_loss": 0.64689863, "learning_rate": 3.241621930235989e-06, "loss": 0.66851032, "num_input_tokens_seen": 110071765, "step": 5126, "time_per_iteration": 2.6408963203430176 }, { "auxiliary_loss_clip": 0.01099238, "auxiliary_loss_mlp": 0.01044536, "balance_loss_clip": 1.05009556, "balance_loss_mlp": 1.02698874, "epoch": 0.3082519164286788, "flos": 22166257052160.0, "grad_norm": 1.5302214532460006, "language_loss": 0.86800975, "learning_rate": 3.241316584201646e-06, "loss": 0.88944745, "num_input_tokens_seen": 110092660, "step": 5127, "time_per_iteration": 2.793318748474121 }, { "auxiliary_loss_clip": 0.01086461, "auxiliary_loss_mlp": 0.01045743, "balance_loss_clip": 1.04368591, "balance_loss_mlp": 1.02862501, "epoch": 0.30831203968134674, "flos": 28913404700160.0, "grad_norm": 1.6968110238499217, "language_loss": 0.69155616, "learning_rate": 3.2410111910964538e-06, "loss": 0.71287817, "num_input_tokens_seen": 110114960, "step": 5128, "time_per_iteration": 2.777060031890869 }, { "auxiliary_loss_clip": 0.01130807, "auxiliary_loss_mlp": 0.00775186, "balance_loss_clip": 1.05044532, "balance_loss_mlp": 1.00153518, "epoch": 0.3083721629340147, "flos": 25667726739840.0, "grad_norm": 1.7900045405252538, "language_loss": 0.71075535, "learning_rate": 3.240705750931993e-06, "loss": 0.7298153, "num_input_tokens_seen": 110135750, "step": 5129, "time_per_iteration": 2.7317588329315186 }, { "auxiliary_loss_clip": 0.01030892, "auxiliary_loss_mlp": 0.01007708, "balance_loss_clip": 1.0286324, "balance_loss_mlp": 1.00588405, "epoch": 0.3084322861866827, "flos": 68212679581440.0, "grad_norm": 0.8221299931057983, "language_loss": 0.59160221, "learning_rate": 3.240400263719846e-06, "loss": 0.61198819, "num_input_tokens_seen": 110189480, "step": 5130, "time_per_iteration": 3.2141849994659424 }, { "auxiliary_loss_clip": 0.01115906, "auxiliary_loss_mlp": 0.01041214, "balance_loss_clip": 1.04513061, "balance_loss_mlp": 1.02297497, "epoch": 0.3084924094393507, "flos": 20296495514880.0, "grad_norm": 2.986922621878904, "language_loss": 0.73292506, "learning_rate": 3.2400947294715957e-06, "loss": 0.75449622, "num_input_tokens_seen": 110206445, "step": 5131, "time_per_iteration": 2.6520204544067383 }, { "auxiliary_loss_clip": 0.01099541, "auxiliary_loss_mlp": 0.010345, "balance_loss_clip": 1.04438055, "balance_loss_mlp": 1.01822817, "epoch": 0.30855253269201866, "flos": 23949831905280.0, "grad_norm": 1.569237882810685, "language_loss": 0.71420097, "learning_rate": 3.2397891481988303e-06, "loss": 0.73554134, "num_input_tokens_seen": 110226845, "step": 5132, "time_per_iteration": 2.8439948558807373 }, { "auxiliary_loss_clip": 0.01134935, "auxiliary_loss_mlp": 0.00774998, "balance_loss_clip": 1.04922795, "balance_loss_mlp": 1.00131333, "epoch": 0.3086126559446866, "flos": 19281876042240.0, "grad_norm": 1.9070570981004293, "language_loss": 0.89846021, "learning_rate": 3.239483519913136e-06, "loss": 0.91755956, "num_input_tokens_seen": 110244095, "step": 5133, "time_per_iteration": 2.5872273445129395 }, { "auxiliary_loss_clip": 0.01122429, "auxiliary_loss_mlp": 0.01043613, "balance_loss_clip": 1.04856205, "balance_loss_mlp": 1.02580321, "epoch": 0.3086727791973546, "flos": 33760770019200.0, "grad_norm": 1.7209646054950307, "language_loss": 0.67267555, "learning_rate": 3.239177844626102e-06, "loss": 0.69433594, "num_input_tokens_seen": 110264240, "step": 5134, "time_per_iteration": 2.7872183322906494 }, { "auxiliary_loss_clip": 0.01124541, "auxiliary_loss_mlp": 0.01041364, "balance_loss_clip": 1.04777277, "balance_loss_mlp": 1.02393556, "epoch": 0.30873290245002255, "flos": 16034151006720.0, "grad_norm": 1.9145067593542924, "language_loss": 0.82794344, "learning_rate": 3.2388721223493197e-06, "loss": 0.84960246, "num_input_tokens_seen": 110282450, "step": 5135, "time_per_iteration": 2.6355140209198 }, { "auxiliary_loss_clip": 0.01026512, "auxiliary_loss_mlp": 0.01003035, "balance_loss_clip": 1.02417064, "balance_loss_mlp": 1.00113988, "epoch": 0.3087930257026905, "flos": 65048304055680.0, "grad_norm": 0.6923211570832432, "language_loss": 0.55314827, "learning_rate": 3.2385663530943824e-06, "loss": 0.57344365, "num_input_tokens_seen": 110343715, "step": 5136, "time_per_iteration": 3.31300687789917 }, { "auxiliary_loss_clip": 0.01118007, "auxiliary_loss_mlp": 0.00775624, "balance_loss_clip": 1.04826593, "balance_loss_mlp": 1.00124264, "epoch": 0.3088531489553585, "flos": 74738829824640.0, "grad_norm": 2.038560176689262, "language_loss": 0.76524079, "learning_rate": 3.2382605368728852e-06, "loss": 0.78417706, "num_input_tokens_seen": 110368430, "step": 5137, "time_per_iteration": 3.1237831115722656 }, { "auxiliary_loss_clip": 0.01102933, "auxiliary_loss_mlp": 0.010362, "balance_loss_clip": 1.04592168, "balance_loss_mlp": 1.02058411, "epoch": 0.30891327220802645, "flos": 21142300043520.0, "grad_norm": 1.655645044155811, "language_loss": 0.80083114, "learning_rate": 3.237954673696424e-06, "loss": 0.82222247, "num_input_tokens_seen": 110386735, "step": 5138, "time_per_iteration": 2.775902509689331 }, { "auxiliary_loss_clip": 0.01078807, "auxiliary_loss_mlp": 0.0104514, "balance_loss_clip": 1.03953338, "balance_loss_mlp": 1.02583957, "epoch": 0.3089733954606944, "flos": 25664494515840.0, "grad_norm": 1.3823165076112356, "language_loss": 0.81288958, "learning_rate": 3.2376487635765983e-06, "loss": 0.8341291, "num_input_tokens_seen": 110406820, "step": 5139, "time_per_iteration": 4.48141074180603 }, { "auxiliary_loss_clip": 0.01127056, "auxiliary_loss_mlp": 0.01044845, "balance_loss_clip": 1.04565382, "balance_loss_mlp": 1.02575994, "epoch": 0.3090335187133624, "flos": 19427350124160.0, "grad_norm": 2.1511159973406593, "language_loss": 0.77260494, "learning_rate": 3.2373428065250067e-06, "loss": 0.79432398, "num_input_tokens_seen": 110424225, "step": 5140, "time_per_iteration": 4.1141037940979 }, { "auxiliary_loss_clip": 0.01099157, "auxiliary_loss_mlp": 0.01048812, "balance_loss_clip": 1.04282403, "balance_loss_mlp": 1.03233695, "epoch": 0.30909364196603034, "flos": 20011329440640.0, "grad_norm": 1.77105935640331, "language_loss": 0.78806967, "learning_rate": 3.237036802553252e-06, "loss": 0.80954939, "num_input_tokens_seen": 110443310, "step": 5141, "time_per_iteration": 2.6497676372528076 }, { "auxiliary_loss_clip": 0.01119702, "auxiliary_loss_mlp": 0.0104967, "balance_loss_clip": 1.04679799, "balance_loss_mlp": 1.03138292, "epoch": 0.3091537652186983, "flos": 19677575243520.0, "grad_norm": 2.261971688212118, "language_loss": 0.86853915, "learning_rate": 3.2367307516729377e-06, "loss": 0.89023286, "num_input_tokens_seen": 110460215, "step": 5142, "time_per_iteration": 2.635495662689209 }, { "auxiliary_loss_clip": 0.01127738, "auxiliary_loss_mlp": 0.01048033, "balance_loss_clip": 1.04709148, "balance_loss_mlp": 1.03136778, "epoch": 0.3092138884713663, "flos": 17020042577280.0, "grad_norm": 1.7222677689082588, "language_loss": 0.79352587, "learning_rate": 3.23642465389567e-06, "loss": 0.81528366, "num_input_tokens_seen": 110479385, "step": 5143, "time_per_iteration": 2.672196388244629 }, { "auxiliary_loss_clip": 0.01108121, "auxiliary_loss_mlp": 0.01046466, "balance_loss_clip": 1.04830873, "balance_loss_mlp": 1.02858496, "epoch": 0.3092740117240343, "flos": 25009986844800.0, "grad_norm": 1.849759687088619, "language_loss": 0.72079581, "learning_rate": 3.236118509233055e-06, "loss": 0.7423417, "num_input_tokens_seen": 110499885, "step": 5144, "time_per_iteration": 4.2138121128082275 }, { "auxiliary_loss_clip": 0.01130266, "auxiliary_loss_mlp": 0.0105055, "balance_loss_clip": 1.04617548, "balance_loss_mlp": 1.03297877, "epoch": 0.30933413497670226, "flos": 25590410714880.0, "grad_norm": 1.9804845877808144, "language_loss": 0.74328083, "learning_rate": 3.235812317696702e-06, "loss": 0.76508898, "num_input_tokens_seen": 110519690, "step": 5145, "time_per_iteration": 4.315273761749268 }, { "auxiliary_loss_clip": 0.01110927, "auxiliary_loss_mlp": 0.01045527, "balance_loss_clip": 1.04372048, "balance_loss_mlp": 1.02788365, "epoch": 0.3093942582293702, "flos": 24389665943040.0, "grad_norm": 1.6657569174801012, "language_loss": 0.76391518, "learning_rate": 3.2355060792982224e-06, "loss": 0.78547978, "num_input_tokens_seen": 110540520, "step": 5146, "time_per_iteration": 2.7259135246276855 }, { "auxiliary_loss_clip": 0.0111122, "auxiliary_loss_mlp": 0.01042459, "balance_loss_clip": 1.04380584, "balance_loss_mlp": 1.02553141, "epoch": 0.3094543814820382, "flos": 19646441130240.0, "grad_norm": 2.148705061921787, "language_loss": 0.66899967, "learning_rate": 3.2351997940492286e-06, "loss": 0.6905365, "num_input_tokens_seen": 110557950, "step": 5147, "time_per_iteration": 2.6804444789886475 }, { "auxiliary_loss_clip": 0.01132642, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.04998684, "balance_loss_mlp": 1.0238843, "epoch": 0.30951450473470615, "flos": 25663812157440.0, "grad_norm": 2.0634223914225585, "language_loss": 0.74823105, "learning_rate": 3.2348934619613346e-06, "loss": 0.76996237, "num_input_tokens_seen": 110578215, "step": 5148, "time_per_iteration": 2.637509346008301 }, { "auxiliary_loss_clip": 0.0113505, "auxiliary_loss_mlp": 0.01047495, "balance_loss_clip": 1.0492146, "balance_loss_mlp": 1.02901721, "epoch": 0.3095746279873741, "flos": 12020415505920.0, "grad_norm": 2.1367843023537287, "language_loss": 0.73082036, "learning_rate": 3.2345870830461567e-06, "loss": 0.75264585, "num_input_tokens_seen": 110592990, "step": 5149, "time_per_iteration": 2.6134157180786133 }, { "auxiliary_loss_clip": 0.01097892, "auxiliary_loss_mlp": 0.0104428, "balance_loss_clip": 1.04601955, "balance_loss_mlp": 1.02615988, "epoch": 0.3096347512400421, "flos": 23623044946560.0, "grad_norm": 2.0797901111423274, "language_loss": 0.845025, "learning_rate": 3.2342806573153132e-06, "loss": 0.86644673, "num_input_tokens_seen": 110612130, "step": 5150, "time_per_iteration": 2.7804181575775146 }, { "auxiliary_loss_clip": 0.01086512, "auxiliary_loss_mlp": 0.01047133, "balance_loss_clip": 1.04168093, "balance_loss_mlp": 1.02820301, "epoch": 0.30969487449271005, "flos": 22529313768960.0, "grad_norm": 1.8768941622145223, "language_loss": 0.78431082, "learning_rate": 3.233974184780424e-06, "loss": 0.80564725, "num_input_tokens_seen": 110632045, "step": 5151, "time_per_iteration": 2.7539470195770264 }, { "auxiliary_loss_clip": 0.01131879, "auxiliary_loss_mlp": 0.01041443, "balance_loss_clip": 1.04880977, "balance_loss_mlp": 1.02362132, "epoch": 0.309754997745378, "flos": 15267925059840.0, "grad_norm": 1.9606136965084777, "language_loss": 0.67416716, "learning_rate": 3.2336676654531084e-06, "loss": 0.69590038, "num_input_tokens_seen": 110649340, "step": 5152, "time_per_iteration": 2.579238176345825 }, { "auxiliary_loss_clip": 0.01080518, "auxiliary_loss_mlp": 0.01045921, "balance_loss_clip": 1.04402971, "balance_loss_mlp": 1.02807546, "epoch": 0.309815120998046, "flos": 26979291947520.0, "grad_norm": 5.6670540450328355, "language_loss": 0.8251189, "learning_rate": 3.2333610993449926e-06, "loss": 0.84638333, "num_input_tokens_seen": 110668450, "step": 5153, "time_per_iteration": 2.792285203933716 }, { "auxiliary_loss_clip": 0.01113849, "auxiliary_loss_mlp": 0.00775793, "balance_loss_clip": 1.04663801, "balance_loss_mlp": 1.00127769, "epoch": 0.30987524425071394, "flos": 21143161969920.0, "grad_norm": 1.937189485762574, "language_loss": 0.73793215, "learning_rate": 3.2330544864676997e-06, "loss": 0.75682855, "num_input_tokens_seen": 110689410, "step": 5154, "time_per_iteration": 2.678454875946045 }, { "auxiliary_loss_clip": 0.01132509, "auxiliary_loss_mlp": 0.0103738, "balance_loss_clip": 1.0507983, "balance_loss_mlp": 1.02009416, "epoch": 0.3099353675033819, "flos": 15268284195840.0, "grad_norm": 2.1601099672999586, "language_loss": 0.76069349, "learning_rate": 3.232747826832858e-06, "loss": 0.78239238, "num_input_tokens_seen": 110707350, "step": 5155, "time_per_iteration": 2.577634334564209 }, { "auxiliary_loss_clip": 0.01131155, "auxiliary_loss_mlp": 0.01040429, "balance_loss_clip": 1.05483913, "balance_loss_mlp": 1.02283418, "epoch": 0.30999549075604993, "flos": 15413794191360.0, "grad_norm": 2.044896457109867, "language_loss": 0.79096609, "learning_rate": 3.232441120452094e-06, "loss": 0.81268191, "num_input_tokens_seen": 110724910, "step": 5156, "time_per_iteration": 2.628363609313965 }, { "auxiliary_loss_clip": 0.01127429, "auxiliary_loss_mlp": 0.01047381, "balance_loss_clip": 1.04775023, "balance_loss_mlp": 1.02779543, "epoch": 0.3100556140087179, "flos": 23184539712000.0, "grad_norm": 2.468311845454126, "language_loss": 0.74950963, "learning_rate": 3.23213436733704e-06, "loss": 0.77125776, "num_input_tokens_seen": 110744010, "step": 5157, "time_per_iteration": 2.6231181621551514 }, { "auxiliary_loss_clip": 0.01108321, "auxiliary_loss_mlp": 0.01042715, "balance_loss_clip": 1.04868615, "balance_loss_mlp": 1.02634752, "epoch": 0.31011573726138586, "flos": 25742169676800.0, "grad_norm": 1.6453166696914168, "language_loss": 0.69648343, "learning_rate": 3.231827567499327e-06, "loss": 0.71799374, "num_input_tokens_seen": 110765835, "step": 5158, "time_per_iteration": 2.734889030456543 }, { "auxiliary_loss_clip": 0.01095116, "auxiliary_loss_mlp": 0.01046106, "balance_loss_clip": 1.04443944, "balance_loss_mlp": 1.0301435, "epoch": 0.3101758605140538, "flos": 20011329440640.0, "grad_norm": 1.9329481500014836, "language_loss": 0.84861457, "learning_rate": 3.2315207209505896e-06, "loss": 0.87002677, "num_input_tokens_seen": 110784655, "step": 5159, "time_per_iteration": 2.665311813354492 }, { "auxiliary_loss_clip": 0.01116498, "auxiliary_loss_mlp": 0.01046065, "balance_loss_clip": 1.04710639, "balance_loss_mlp": 1.02877951, "epoch": 0.3102359837667218, "flos": 19135683688320.0, "grad_norm": 1.9614748869944683, "language_loss": 0.85129201, "learning_rate": 3.231213827702462e-06, "loss": 0.87291765, "num_input_tokens_seen": 110802545, "step": 5160, "time_per_iteration": 2.597130298614502 }, { "auxiliary_loss_clip": 0.01133056, "auxiliary_loss_mlp": 0.01042602, "balance_loss_clip": 1.0520395, "balance_loss_mlp": 1.02582884, "epoch": 0.31029610701938976, "flos": 22265405568000.0, "grad_norm": 1.9459577302566504, "language_loss": 0.75555152, "learning_rate": 3.230906887766584e-06, "loss": 0.77730811, "num_input_tokens_seen": 110820265, "step": 5161, "time_per_iteration": 2.583240032196045 }, { "auxiliary_loss_clip": 0.0113313, "auxiliary_loss_mlp": 0.01045414, "balance_loss_clip": 1.05046988, "balance_loss_mlp": 1.02797401, "epoch": 0.3103562302720577, "flos": 20805349536000.0, "grad_norm": 1.9938857241338979, "language_loss": 0.8156144, "learning_rate": 3.2305999011545924e-06, "loss": 0.83739984, "num_input_tokens_seen": 110836195, "step": 5162, "time_per_iteration": 2.495689630508423 }, { "auxiliary_loss_clip": 0.01128762, "auxiliary_loss_mlp": 0.01039959, "balance_loss_clip": 1.04903293, "balance_loss_mlp": 1.02450919, "epoch": 0.3104163535247257, "flos": 22344158136960.0, "grad_norm": 1.777649785974679, "language_loss": 0.82892883, "learning_rate": 3.2302928678781295e-06, "loss": 0.85061604, "num_input_tokens_seen": 110856420, "step": 5163, "time_per_iteration": 2.591036081314087 }, { "auxiliary_loss_clip": 0.01147486, "auxiliary_loss_mlp": 0.01044526, "balance_loss_clip": 1.05307984, "balance_loss_mlp": 1.0273242, "epoch": 0.31047647677739365, "flos": 21689363157120.0, "grad_norm": 1.875247009463239, "language_loss": 0.76131678, "learning_rate": 3.2299857879488376e-06, "loss": 0.78323686, "num_input_tokens_seen": 110876650, "step": 5164, "time_per_iteration": 2.5745677947998047 }, { "auxiliary_loss_clip": 0.01103275, "auxiliary_loss_mlp": 0.01046349, "balance_loss_clip": 1.04969811, "balance_loss_mlp": 1.02880108, "epoch": 0.3105366000300616, "flos": 18917275040640.0, "grad_norm": 3.462886730904856, "language_loss": 0.74514711, "learning_rate": 3.2296786613783626e-06, "loss": 0.7666434, "num_input_tokens_seen": 110894445, "step": 5165, "time_per_iteration": 2.724846124649048 }, { "auxiliary_loss_clip": 0.01100578, "auxiliary_loss_mlp": 0.01057021, "balance_loss_clip": 1.04695523, "balance_loss_mlp": 1.03841233, "epoch": 0.3105967232827296, "flos": 18260397072000.0, "grad_norm": 1.6273273492295701, "language_loss": 0.75827682, "learning_rate": 3.229371488178348e-06, "loss": 0.77985275, "num_input_tokens_seen": 110912855, "step": 5166, "time_per_iteration": 2.7309961318969727 }, { "auxiliary_loss_clip": 0.01121318, "auxiliary_loss_mlp": 0.01043526, "balance_loss_clip": 1.04969096, "balance_loss_mlp": 1.02665818, "epoch": 0.31065684653539755, "flos": 17672144037120.0, "grad_norm": 2.1635307284170833, "language_loss": 0.73621917, "learning_rate": 3.229064268360444e-06, "loss": 0.75786763, "num_input_tokens_seen": 110928025, "step": 5167, "time_per_iteration": 2.623375654220581 }, { "auxiliary_loss_clip": 0.01007539, "auxiliary_loss_mlp": 0.01008435, "balance_loss_clip": 1.02476823, "balance_loss_mlp": 1.0059557, "epoch": 0.3107169697880655, "flos": 68531996511360.0, "grad_norm": 0.7113763854018822, "language_loss": 0.53030008, "learning_rate": 3.2287570019362997e-06, "loss": 0.55045986, "num_input_tokens_seen": 110992215, "step": 5168, "time_per_iteration": 3.3115129470825195 }, { "auxiliary_loss_clip": 0.01138497, "auxiliary_loss_mlp": 0.01050074, "balance_loss_clip": 1.05561399, "balance_loss_mlp": 1.03151321, "epoch": 0.3107770930407335, "flos": 13188733274880.0, "grad_norm": 3.621905149464154, "language_loss": 0.79032969, "learning_rate": 3.2284496889175668e-06, "loss": 0.81221539, "num_input_tokens_seen": 111010400, "step": 5169, "time_per_iteration": 2.595463514328003 }, { "auxiliary_loss_clip": 0.01121822, "auxiliary_loss_mlp": 0.01047209, "balance_loss_clip": 1.04804373, "balance_loss_mlp": 1.02937579, "epoch": 0.3108372162934015, "flos": 31580849520000.0, "grad_norm": 1.57130024638105, "language_loss": 0.64071, "learning_rate": 3.2281423293158986e-06, "loss": 0.66240036, "num_input_tokens_seen": 111033960, "step": 5170, "time_per_iteration": 2.746469497680664 }, { "auxiliary_loss_clip": 0.0110491, "auxiliary_loss_mlp": 0.00776539, "balance_loss_clip": 1.04874384, "balance_loss_mlp": 1.00120461, "epoch": 0.31089733954606946, "flos": 28729829266560.0, "grad_norm": 2.172069963879317, "language_loss": 0.7723515, "learning_rate": 3.22783492314295e-06, "loss": 0.79116607, "num_input_tokens_seen": 111053265, "step": 5171, "time_per_iteration": 2.776974678039551 }, { "auxiliary_loss_clip": 0.01100832, "auxiliary_loss_mlp": 0.01048172, "balance_loss_clip": 1.049088, "balance_loss_mlp": 1.03055298, "epoch": 0.3109574627987374, "flos": 19683249592320.0, "grad_norm": 1.830523579545495, "language_loss": 0.84020013, "learning_rate": 3.2275274704103785e-06, "loss": 0.86169016, "num_input_tokens_seen": 111071130, "step": 5172, "time_per_iteration": 2.718118906021118 }, { "auxiliary_loss_clip": 0.01091688, "auxiliary_loss_mlp": 0.01045541, "balance_loss_clip": 1.04622412, "balance_loss_mlp": 1.02706313, "epoch": 0.3110175860514054, "flos": 14683981656960.0, "grad_norm": 1.9540355263753015, "language_loss": 0.83730888, "learning_rate": 3.227219971129842e-06, "loss": 0.8586812, "num_input_tokens_seen": 111089560, "step": 5173, "time_per_iteration": 2.735163927078247 }, { "auxiliary_loss_clip": 0.01145239, "auxiliary_loss_mlp": 0.01042621, "balance_loss_clip": 1.05589437, "balance_loss_mlp": 1.02656341, "epoch": 0.31107770930407336, "flos": 25739655724800.0, "grad_norm": 3.2612368513370495, "language_loss": 0.83354348, "learning_rate": 3.226912425313001e-06, "loss": 0.85542202, "num_input_tokens_seen": 111109960, "step": 5174, "time_per_iteration": 2.65226411819458 }, { "auxiliary_loss_clip": 0.01122854, "auxiliary_loss_mlp": 0.01046101, "balance_loss_clip": 1.05162597, "balance_loss_mlp": 1.02928042, "epoch": 0.3111378325567413, "flos": 19208259118080.0, "grad_norm": 1.9777752297496725, "language_loss": 0.85181922, "learning_rate": 3.2266048329715183e-06, "loss": 0.87350869, "num_input_tokens_seen": 111127960, "step": 5175, "time_per_iteration": 2.6930692195892334 }, { "auxiliary_loss_clip": 0.01087659, "auxiliary_loss_mlp": 0.01044685, "balance_loss_clip": 1.04638839, "balance_loss_mlp": 1.02623129, "epoch": 0.3111979558094093, "flos": 23696374561920.0, "grad_norm": 1.845729409399547, "language_loss": 0.82990116, "learning_rate": 3.2262971941170575e-06, "loss": 0.8512246, "num_input_tokens_seen": 111146730, "step": 5176, "time_per_iteration": 2.7975289821624756 }, { "auxiliary_loss_clip": 0.01126555, "auxiliary_loss_mlp": 0.01042513, "balance_loss_clip": 1.04662132, "balance_loss_mlp": 1.02361798, "epoch": 0.31125807906207725, "flos": 21033023892480.0, "grad_norm": 1.9258407965023028, "language_loss": 0.8096348, "learning_rate": 3.2259895087612837e-06, "loss": 0.83132547, "num_input_tokens_seen": 111166295, "step": 5177, "time_per_iteration": 2.6275687217712402 }, { "auxiliary_loss_clip": 0.01134117, "auxiliary_loss_mlp": 0.0077682, "balance_loss_clip": 1.05381465, "balance_loss_mlp": 1.00119591, "epoch": 0.3113182023147452, "flos": 23076628277760.0, "grad_norm": 1.6855068015846089, "language_loss": 0.80707169, "learning_rate": 3.2256817769158657e-06, "loss": 0.82618099, "num_input_tokens_seen": 111185665, "step": 5178, "time_per_iteration": 4.142611742019653 }, { "auxiliary_loss_clip": 0.01119942, "auxiliary_loss_mlp": 0.01047667, "balance_loss_clip": 1.05289316, "balance_loss_mlp": 1.03076327, "epoch": 0.3113783255674132, "flos": 11838994888320.0, "grad_norm": 2.5880769767242633, "language_loss": 0.80990803, "learning_rate": 3.225373998592471e-06, "loss": 0.83158416, "num_input_tokens_seen": 111201615, "step": 5179, "time_per_iteration": 2.6429331302642822 }, { "auxiliary_loss_clip": 0.01112505, "auxiliary_loss_mlp": 0.01048581, "balance_loss_clip": 1.05353093, "balance_loss_mlp": 1.03139079, "epoch": 0.31143844882008115, "flos": 16289547684480.0, "grad_norm": 2.4201759029551813, "language_loss": 0.78532577, "learning_rate": 3.2250661738027715e-06, "loss": 0.80693662, "num_input_tokens_seen": 111220515, "step": 5180, "time_per_iteration": 4.1918723583221436 }, { "auxiliary_loss_clip": 0.01107686, "auxiliary_loss_mlp": 0.01037212, "balance_loss_clip": 1.05114985, "balance_loss_mlp": 1.02011788, "epoch": 0.3114985720727491, "flos": 23217792727680.0, "grad_norm": 1.6775849826612523, "language_loss": 0.83088589, "learning_rate": 3.22475830255844e-06, "loss": 0.85233486, "num_input_tokens_seen": 111240395, "step": 5181, "time_per_iteration": 2.760340929031372 }, { "auxiliary_loss_clip": 0.01110614, "auxiliary_loss_mlp": 0.01044232, "balance_loss_clip": 1.04879427, "balance_loss_mlp": 1.02881861, "epoch": 0.3115586953254171, "flos": 30044626698240.0, "grad_norm": 1.766790552230027, "language_loss": 0.74396992, "learning_rate": 3.2244503848711516e-06, "loss": 0.76551843, "num_input_tokens_seen": 111261100, "step": 5182, "time_per_iteration": 2.7501730918884277 }, { "auxiliary_loss_clip": 0.01093489, "auxiliary_loss_mlp": 0.00776946, "balance_loss_clip": 1.04811049, "balance_loss_mlp": 1.00152898, "epoch": 0.3116188185780851, "flos": 25666326109440.0, "grad_norm": 2.03695228940596, "language_loss": 0.70169222, "learning_rate": 3.2241424207525815e-06, "loss": 0.72039658, "num_input_tokens_seen": 111281320, "step": 5183, "time_per_iteration": 4.26041579246521 }, { "auxiliary_loss_clip": 0.01017812, "auxiliary_loss_mlp": 0.01006564, "balance_loss_clip": 1.01984847, "balance_loss_mlp": 1.00418019, "epoch": 0.31167894183075306, "flos": 69510058917120.0, "grad_norm": 0.9394459872440335, "language_loss": 0.59573013, "learning_rate": 3.223834410214408e-06, "loss": 0.61597383, "num_input_tokens_seen": 111341405, "step": 5184, "time_per_iteration": 4.992337226867676 }, { "auxiliary_loss_clip": 0.01115495, "auxiliary_loss_mlp": 0.01050891, "balance_loss_clip": 1.04588842, "balance_loss_mlp": 1.03422523, "epoch": 0.31173906508342103, "flos": 14939845211520.0, "grad_norm": 2.48453112640368, "language_loss": 0.70156622, "learning_rate": 3.223526353268311e-06, "loss": 0.72323, "num_input_tokens_seen": 111358975, "step": 5185, "time_per_iteration": 2.6406824588775635 }, { "auxiliary_loss_clip": 0.01122412, "auxiliary_loss_mlp": 0.01051261, "balance_loss_clip": 1.05447555, "balance_loss_mlp": 1.03405905, "epoch": 0.311799188336089, "flos": 16176033728640.0, "grad_norm": 2.8983279272522853, "language_loss": 0.63588691, "learning_rate": 3.2232182499259725e-06, "loss": 0.65762365, "num_input_tokens_seen": 111375845, "step": 5186, "time_per_iteration": 2.683971881866455 }, { "auxiliary_loss_clip": 0.01126858, "auxiliary_loss_mlp": 0.01049881, "balance_loss_clip": 1.05240881, "balance_loss_mlp": 1.03145099, "epoch": 0.31185931158875696, "flos": 25009627708800.0, "grad_norm": 2.2127415604209335, "language_loss": 0.86427295, "learning_rate": 3.2229101001990747e-06, "loss": 0.88604033, "num_input_tokens_seen": 111394150, "step": 5187, "time_per_iteration": 2.6983299255371094 }, { "auxiliary_loss_clip": 0.01146114, "auxiliary_loss_mlp": 0.0077496, "balance_loss_clip": 1.05417776, "balance_loss_mlp": 1.00131774, "epoch": 0.3119194348414249, "flos": 37232901273600.0, "grad_norm": 1.653121843679143, "language_loss": 0.63481069, "learning_rate": 3.2226019040993036e-06, "loss": 0.6540215, "num_input_tokens_seen": 111418355, "step": 5188, "time_per_iteration": 2.6974728107452393 }, { "auxiliary_loss_clip": 0.01106256, "auxiliary_loss_mlp": 0.01044626, "balance_loss_clip": 1.05064225, "balance_loss_mlp": 1.02799582, "epoch": 0.3119795580940929, "flos": 15012779777280.0, "grad_norm": 2.578497111530561, "language_loss": 0.83241487, "learning_rate": 3.222293661638346e-06, "loss": 0.85392368, "num_input_tokens_seen": 111435445, "step": 5189, "time_per_iteration": 2.6956889629364014 }, { "auxiliary_loss_clip": 0.01031008, "auxiliary_loss_mlp": 0.01045956, "balance_loss_clip": 1.03804195, "balance_loss_mlp": 1.02812243, "epoch": 0.31203968134676086, "flos": 15998168557440.0, "grad_norm": 1.8156368008577992, "language_loss": 0.79266763, "learning_rate": 3.22198537282789e-06, "loss": 0.81343722, "num_input_tokens_seen": 111453430, "step": 5190, "time_per_iteration": 3.0180671215057373 }, { "auxiliary_loss_clip": 0.01086186, "auxiliary_loss_mlp": 0.01053443, "balance_loss_clip": 1.04333639, "balance_loss_mlp": 1.03413141, "epoch": 0.3120998045994288, "flos": 23837359443840.0, "grad_norm": 1.571307617405072, "language_loss": 0.75174087, "learning_rate": 3.2216770376796262e-06, "loss": 0.77313721, "num_input_tokens_seen": 111475325, "step": 5191, "time_per_iteration": 3.0170204639434814 }, { "auxiliary_loss_clip": 0.01043661, "auxiliary_loss_mlp": 0.00755081, "balance_loss_clip": 1.02154636, "balance_loss_mlp": 1.00261629, "epoch": 0.3121599278520968, "flos": 69184205712000.0, "grad_norm": 0.8534965117798614, "language_loss": 0.63942307, "learning_rate": 3.221368656205247e-06, "loss": 0.6574105, "num_input_tokens_seen": 111533960, "step": 5192, "time_per_iteration": 3.288938045501709 }, { "auxiliary_loss_clip": 0.01133662, "auxiliary_loss_mlp": 0.01043466, "balance_loss_clip": 1.05246997, "balance_loss_mlp": 1.02569187, "epoch": 0.31222005110476475, "flos": 23806368984960.0, "grad_norm": 1.9226654053779162, "language_loss": 0.7976644, "learning_rate": 3.221060228416446e-06, "loss": 0.81943566, "num_input_tokens_seen": 111554055, "step": 5193, "time_per_iteration": 2.758859157562256 }, { "auxiliary_loss_clip": 0.01117628, "auxiliary_loss_mlp": 0.01054751, "balance_loss_clip": 1.04916263, "balance_loss_mlp": 1.03508139, "epoch": 0.3122801743574327, "flos": 25226132935680.0, "grad_norm": 2.5170295869133024, "language_loss": 0.72488689, "learning_rate": 3.2207517543249183e-06, "loss": 0.74661064, "num_input_tokens_seen": 111574305, "step": 5194, "time_per_iteration": 2.69765567779541 }, { "auxiliary_loss_clip": 0.01144699, "auxiliary_loss_mlp": 0.01044476, "balance_loss_clip": 1.05394197, "balance_loss_mlp": 1.02819204, "epoch": 0.3123402976101007, "flos": 22966490200320.0, "grad_norm": 1.775027795968239, "language_loss": 0.76423192, "learning_rate": 3.2204432339423616e-06, "loss": 0.78612363, "num_input_tokens_seen": 111595680, "step": 5195, "time_per_iteration": 2.665656566619873 }, { "auxiliary_loss_clip": 0.01144607, "auxiliary_loss_mlp": 0.01042079, "balance_loss_clip": 1.05148935, "balance_loss_mlp": 1.02544916, "epoch": 0.3124004208627687, "flos": 25192089820800.0, "grad_norm": 1.4414001308378115, "language_loss": 0.78089559, "learning_rate": 3.220134667280476e-06, "loss": 0.80276251, "num_input_tokens_seen": 111618135, "step": 5196, "time_per_iteration": 2.682476282119751 }, { "auxiliary_loss_clip": 0.01032618, "auxiliary_loss_mlp": 0.00755246, "balance_loss_clip": 1.02237272, "balance_loss_mlp": 1.00273037, "epoch": 0.31246054411543667, "flos": 67485165517440.0, "grad_norm": 0.794984063014186, "language_loss": 0.54770386, "learning_rate": 3.2198260543509613e-06, "loss": 0.56558245, "num_input_tokens_seen": 111682220, "step": 5197, "time_per_iteration": 3.24509334564209 }, { "auxiliary_loss_clip": 0.01144094, "auxiliary_loss_mlp": 0.01042495, "balance_loss_clip": 1.0547365, "balance_loss_mlp": 1.02586555, "epoch": 0.31252066736810463, "flos": 17858520731520.0, "grad_norm": 1.8260094290654212, "language_loss": 0.66137004, "learning_rate": 3.21951739516552e-06, "loss": 0.68323588, "num_input_tokens_seen": 111700815, "step": 5198, "time_per_iteration": 2.5970942974090576 }, { "auxiliary_loss_clip": 0.01102297, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.0459094, "balance_loss_mlp": 1.02898037, "epoch": 0.3125807906207726, "flos": 18475034791680.0, "grad_norm": 2.530729988117139, "language_loss": 0.6949119, "learning_rate": 3.219208689735857e-06, "loss": 0.71640968, "num_input_tokens_seen": 111718195, "step": 5199, "time_per_iteration": 2.6682288646698 }, { "auxiliary_loss_clip": 0.01132634, "auxiliary_loss_mlp": 0.01050152, "balance_loss_clip": 1.04906189, "balance_loss_mlp": 1.03258061, "epoch": 0.31264091387344056, "flos": 18946541646720.0, "grad_norm": 1.8087592578592666, "language_loss": 0.78480452, "learning_rate": 3.2188999380736785e-06, "loss": 0.8066324, "num_input_tokens_seen": 111734440, "step": 5200, "time_per_iteration": 2.6664814949035645 }, { "auxiliary_loss_clip": 0.01132139, "auxiliary_loss_mlp": 0.01037041, "balance_loss_clip": 1.05233109, "balance_loss_mlp": 1.02036345, "epoch": 0.3127010371261085, "flos": 21468512384640.0, "grad_norm": 2.0480479984687214, "language_loss": 0.83231741, "learning_rate": 3.2185911401906917e-06, "loss": 0.85400921, "num_input_tokens_seen": 111751960, "step": 5201, "time_per_iteration": 2.674558401107788 }, { "auxiliary_loss_clip": 0.01144703, "auxiliary_loss_mlp": 0.01045083, "balance_loss_clip": 1.05244124, "balance_loss_mlp": 1.02697527, "epoch": 0.3127611603787765, "flos": 15336047203200.0, "grad_norm": 3.6217323271444037, "language_loss": 0.6910159, "learning_rate": 3.2182822960986072e-06, "loss": 0.71291375, "num_input_tokens_seen": 111769585, "step": 5202, "time_per_iteration": 2.563164710998535 }, { "auxiliary_loss_clip": 0.01146715, "auxiliary_loss_mlp": 0.01041598, "balance_loss_clip": 1.05293012, "balance_loss_mlp": 1.02608871, "epoch": 0.31282128363144446, "flos": 17602980399360.0, "grad_norm": 1.898082303559049, "language_loss": 0.84124672, "learning_rate": 3.2179734058091358e-06, "loss": 0.86312985, "num_input_tokens_seen": 111787880, "step": 5203, "time_per_iteration": 2.6024506092071533 }, { "auxiliary_loss_clip": 0.01086755, "auxiliary_loss_mlp": 0.01049344, "balance_loss_clip": 1.04461396, "balance_loss_mlp": 1.03139079, "epoch": 0.3128814068841124, "flos": 26756753235840.0, "grad_norm": 2.246749233698224, "language_loss": 0.61165982, "learning_rate": 3.2176644693339913e-06, "loss": 0.63302082, "num_input_tokens_seen": 111805950, "step": 5204, "time_per_iteration": 2.748486042022705 }, { "auxiliary_loss_clip": 0.01105223, "auxiliary_loss_mlp": 0.01043537, "balance_loss_clip": 1.04439998, "balance_loss_mlp": 1.02722907, "epoch": 0.3129415301367804, "flos": 22272372806400.0, "grad_norm": 1.6432390116063589, "language_loss": 0.65875763, "learning_rate": 3.217355486684887e-06, "loss": 0.68024528, "num_input_tokens_seen": 111826135, "step": 5205, "time_per_iteration": 2.717499256134033 }, { "auxiliary_loss_clip": 0.01134026, "auxiliary_loss_mlp": 0.01046734, "balance_loss_clip": 1.05126929, "balance_loss_mlp": 1.02849531, "epoch": 0.31300165338944835, "flos": 26464907232000.0, "grad_norm": 1.6106510494401134, "language_loss": 0.76811433, "learning_rate": 3.2170464578735414e-06, "loss": 0.78992188, "num_input_tokens_seen": 111844700, "step": 5206, "time_per_iteration": 2.642439603805542 }, { "auxiliary_loss_clip": 0.01140688, "auxiliary_loss_mlp": 0.01041131, "balance_loss_clip": 1.04956853, "balance_loss_mlp": 1.02448893, "epoch": 0.3130617766421163, "flos": 21944652094080.0, "grad_norm": 2.214530025407602, "language_loss": 0.83204615, "learning_rate": 3.216737382911672e-06, "loss": 0.85386431, "num_input_tokens_seen": 111861585, "step": 5207, "time_per_iteration": 2.616652727127075 }, { "auxiliary_loss_clip": 0.01127002, "auxiliary_loss_mlp": 0.0104831, "balance_loss_clip": 1.0502398, "balance_loss_mlp": 1.0328126, "epoch": 0.3131218998947843, "flos": 23292774368640.0, "grad_norm": 1.5207985149404841, "language_loss": 0.71359724, "learning_rate": 3.216428261810999e-06, "loss": 0.73535037, "num_input_tokens_seen": 111882950, "step": 5208, "time_per_iteration": 2.674813747406006 }, { "auxiliary_loss_clip": 0.01120564, "auxiliary_loss_mlp": 0.01045064, "balance_loss_clip": 1.04862344, "balance_loss_mlp": 1.02827978, "epoch": 0.3131820231474523, "flos": 21139642437120.0, "grad_norm": 1.848256205390157, "language_loss": 0.74558908, "learning_rate": 3.2161190945832445e-06, "loss": 0.76724535, "num_input_tokens_seen": 111901640, "step": 5209, "time_per_iteration": 2.7193644046783447 }, { "auxiliary_loss_clip": 0.01140035, "auxiliary_loss_mlp": 0.01045727, "balance_loss_clip": 1.04733396, "balance_loss_mlp": 1.02937174, "epoch": 0.31324214640012027, "flos": 23909863046400.0, "grad_norm": 2.0633998475681135, "language_loss": 0.77254915, "learning_rate": 3.2158098812401325e-06, "loss": 0.79440677, "num_input_tokens_seen": 111919615, "step": 5210, "time_per_iteration": 2.6212270259857178 }, { "auxiliary_loss_clip": 0.01125553, "auxiliary_loss_mlp": 0.01039925, "balance_loss_clip": 1.047261, "balance_loss_mlp": 1.02385592, "epoch": 0.31330226965278823, "flos": 22236929061120.0, "grad_norm": 1.9577389211395706, "language_loss": 0.79128736, "learning_rate": 3.2155006217933874e-06, "loss": 0.81294215, "num_input_tokens_seen": 111938485, "step": 5211, "time_per_iteration": 2.6618316173553467 }, { "auxiliary_loss_clip": 0.01132257, "auxiliary_loss_mlp": 0.01042587, "balance_loss_clip": 1.05107522, "balance_loss_mlp": 1.02768588, "epoch": 0.3133623929054562, "flos": 19753993428480.0, "grad_norm": 2.4581961413264195, "language_loss": 0.79612064, "learning_rate": 3.2151913162547367e-06, "loss": 0.81786901, "num_input_tokens_seen": 111956425, "step": 5212, "time_per_iteration": 2.81793475151062 }, { "auxiliary_loss_clip": 0.01125931, "auxiliary_loss_mlp": 0.01053393, "balance_loss_clip": 1.05156052, "balance_loss_mlp": 1.03576159, "epoch": 0.31342251615812416, "flos": 27162256849920.0, "grad_norm": 2.69561664367352, "language_loss": 0.71024299, "learning_rate": 3.2148819646359097e-06, "loss": 0.73203623, "num_input_tokens_seen": 111975915, "step": 5213, "time_per_iteration": 2.6739485263824463 }, { "auxiliary_loss_clip": 0.01132672, "auxiliary_loss_mlp": 0.01045903, "balance_loss_clip": 1.05284989, "balance_loss_mlp": 1.02961898, "epoch": 0.31348263941079213, "flos": 20229809915520.0, "grad_norm": 1.9828215257111186, "language_loss": 0.77684069, "learning_rate": 3.2145725669486374e-06, "loss": 0.79862642, "num_input_tokens_seen": 111995055, "step": 5214, "time_per_iteration": 2.6108171939849854 }, { "auxiliary_loss_clip": 0.01099316, "auxiliary_loss_mlp": 0.01038553, "balance_loss_clip": 1.0522778, "balance_loss_mlp": 1.02317524, "epoch": 0.3135427626634601, "flos": 24607643627520.0, "grad_norm": 2.2634840816113075, "language_loss": 0.8300609, "learning_rate": 3.2142631232046517e-06, "loss": 0.8514396, "num_input_tokens_seen": 112015830, "step": 5215, "time_per_iteration": 2.77897047996521 }, { "auxiliary_loss_clip": 0.01131919, "auxiliary_loss_mlp": 0.01040929, "balance_loss_clip": 1.05089617, "balance_loss_mlp": 1.02375078, "epoch": 0.31360288591612806, "flos": 20959873845120.0, "grad_norm": 2.280765330466862, "language_loss": 0.79540187, "learning_rate": 3.213953633415686e-06, "loss": 0.81713033, "num_input_tokens_seen": 112035065, "step": 5216, "time_per_iteration": 2.675492763519287 }, { "auxiliary_loss_clip": 0.01119434, "auxiliary_loss_mlp": 0.01049814, "balance_loss_clip": 1.04817545, "balance_loss_mlp": 1.03174222, "epoch": 0.313663009168796, "flos": 26980513009920.0, "grad_norm": 1.97082305961493, "language_loss": 0.69007474, "learning_rate": 3.213644097593477e-06, "loss": 0.7117672, "num_input_tokens_seen": 112058405, "step": 5217, "time_per_iteration": 2.7360196113586426 }, { "auxiliary_loss_clip": 0.01121348, "auxiliary_loss_mlp": 0.01038659, "balance_loss_clip": 1.04833519, "balance_loss_mlp": 1.02275062, "epoch": 0.313723132421464, "flos": 18040911016320.0, "grad_norm": 1.7253432561329243, "language_loss": 0.81228399, "learning_rate": 3.2133345157497624e-06, "loss": 0.83388406, "num_input_tokens_seen": 112076420, "step": 5218, "time_per_iteration": 4.393778562545776 }, { "auxiliary_loss_clip": 0.01139073, "auxiliary_loss_mlp": 0.01041023, "balance_loss_clip": 1.04819143, "balance_loss_mlp": 1.02422082, "epoch": 0.31378325567413196, "flos": 22488913946880.0, "grad_norm": 2.6452768271158167, "language_loss": 0.69128895, "learning_rate": 3.2130248878962813e-06, "loss": 0.71308994, "num_input_tokens_seen": 112090775, "step": 5219, "time_per_iteration": 4.162578344345093 }, { "auxiliary_loss_clip": 0.01117748, "auxiliary_loss_mlp": 0.01044298, "balance_loss_clip": 1.04879618, "balance_loss_mlp": 1.0287652, "epoch": 0.3138433789267999, "flos": 22419247518720.0, "grad_norm": 5.057996341652072, "language_loss": 0.80019122, "learning_rate": 3.2127152140447747e-06, "loss": 0.82181168, "num_input_tokens_seen": 112110980, "step": 5220, "time_per_iteration": 2.693300247192383 }, { "auxiliary_loss_clip": 0.01133002, "auxiliary_loss_mlp": 0.01038024, "balance_loss_clip": 1.05214572, "balance_loss_mlp": 1.0220139, "epoch": 0.3139035021794679, "flos": 13005912026880.0, "grad_norm": 1.7918234828134079, "language_loss": 0.72575235, "learning_rate": 3.212405494206986e-06, "loss": 0.74746263, "num_input_tokens_seen": 112129020, "step": 5221, "time_per_iteration": 2.6918861865997314 }, { "auxiliary_loss_clip": 0.01105754, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.04538214, "balance_loss_mlp": 1.02435017, "epoch": 0.31396362543213585, "flos": 16945994689920.0, "grad_norm": 1.7850671432610508, "language_loss": 0.82097268, "learning_rate": 3.2120957283946588e-06, "loss": 0.84243071, "num_input_tokens_seen": 112147865, "step": 5222, "time_per_iteration": 4.193262100219727 }, { "auxiliary_loss_clip": 0.01136096, "auxiliary_loss_mlp": 0.01044943, "balance_loss_clip": 1.05302894, "balance_loss_mlp": 1.02764595, "epoch": 0.31402374868480387, "flos": 20156731695360.0, "grad_norm": 2.3946225731958073, "language_loss": 0.70159894, "learning_rate": 3.2117859166195407e-06, "loss": 0.7234093, "num_input_tokens_seen": 112166745, "step": 5223, "time_per_iteration": 2.642608642578125 }, { "auxiliary_loss_clip": 0.01120375, "auxiliary_loss_mlp": 0.00773089, "balance_loss_clip": 1.04545665, "balance_loss_mlp": 1.0012387, "epoch": 0.31408387193747184, "flos": 21251073404160.0, "grad_norm": 1.5662600408509175, "language_loss": 0.80818307, "learning_rate": 3.211476058893379e-06, "loss": 0.82711768, "num_input_tokens_seen": 112185895, "step": 5224, "time_per_iteration": 4.334134101867676 }, { "auxiliary_loss_clip": 0.0113849, "auxiliary_loss_mlp": 0.01044903, "balance_loss_clip": 1.05376673, "balance_loss_mlp": 1.02807033, "epoch": 0.3141439951901398, "flos": 27484267299840.0, "grad_norm": 2.581635190586104, "language_loss": 0.57647121, "learning_rate": 3.2111661552279243e-06, "loss": 0.59830517, "num_input_tokens_seen": 112204465, "step": 5225, "time_per_iteration": 2.680227041244507 }, { "auxiliary_loss_clip": 0.01086502, "auxiliary_loss_mlp": 0.01032759, "balance_loss_clip": 1.04252625, "balance_loss_mlp": 1.0179472, "epoch": 0.31420411844280777, "flos": 17852235851520.0, "grad_norm": 2.0500851879408577, "language_loss": 0.81726074, "learning_rate": 3.2108562056349273e-06, "loss": 0.83845341, "num_input_tokens_seen": 112221635, "step": 5226, "time_per_iteration": 2.8080878257751465 }, { "auxiliary_loss_clip": 0.01123539, "auxiliary_loss_mlp": 0.01053238, "balance_loss_clip": 1.04718053, "balance_loss_mlp": 1.03557122, "epoch": 0.31426424169547573, "flos": 21616967295360.0, "grad_norm": 1.8156350578732643, "language_loss": 0.7435357, "learning_rate": 3.210546210126141e-06, "loss": 0.76530349, "num_input_tokens_seen": 112241240, "step": 5227, "time_per_iteration": 2.6420040130615234 }, { "auxiliary_loss_clip": 0.01128154, "auxiliary_loss_mlp": 0.01036288, "balance_loss_clip": 1.05315053, "balance_loss_mlp": 1.01981306, "epoch": 0.3143243649481437, "flos": 30920631586560.0, "grad_norm": 1.9798889840887306, "language_loss": 0.6779027, "learning_rate": 3.2102361687133213e-06, "loss": 0.69954711, "num_input_tokens_seen": 112262350, "step": 5228, "time_per_iteration": 2.6904454231262207 }, { "auxiliary_loss_clip": 0.01116854, "auxiliary_loss_mlp": 0.01042698, "balance_loss_clip": 1.04812217, "balance_loss_mlp": 1.02755868, "epoch": 0.31438448820081166, "flos": 22821411168000.0, "grad_norm": 2.2592581290101648, "language_loss": 0.802086, "learning_rate": 3.2099260814082254e-06, "loss": 0.82368147, "num_input_tokens_seen": 112283710, "step": 5229, "time_per_iteration": 2.720972776412964 }, { "auxiliary_loss_clip": 0.01116185, "auxiliary_loss_mlp": 0.01034979, "balance_loss_clip": 1.04888391, "balance_loss_mlp": 1.01917148, "epoch": 0.3144446114534796, "flos": 23292127923840.0, "grad_norm": 2.206396959728329, "language_loss": 0.69972271, "learning_rate": 3.209615948222611e-06, "loss": 0.72123438, "num_input_tokens_seen": 112304285, "step": 5230, "time_per_iteration": 2.69555401802063 }, { "auxiliary_loss_clip": 0.01094216, "auxiliary_loss_mlp": 0.01051308, "balance_loss_clip": 1.042889, "balance_loss_mlp": 1.03331971, "epoch": 0.3145047347061476, "flos": 31355976424320.0, "grad_norm": 11.083232715551919, "language_loss": 0.79441226, "learning_rate": 3.209305769168239e-06, "loss": 0.81586754, "num_input_tokens_seen": 112325110, "step": 5231, "time_per_iteration": 2.742414712905884 }, { "auxiliary_loss_clip": 0.01111136, "auxiliary_loss_mlp": 0.01044032, "balance_loss_clip": 1.05004621, "balance_loss_mlp": 1.02751017, "epoch": 0.31456485795881556, "flos": 10889552643840.0, "grad_norm": 68.21693219117104, "language_loss": 0.84846044, "learning_rate": 3.2089955442568704e-06, "loss": 0.87001216, "num_input_tokens_seen": 112339855, "step": 5232, "time_per_iteration": 2.681541919708252 }, { "auxiliary_loss_clip": 0.01082351, "auxiliary_loss_mlp": 0.01063678, "balance_loss_clip": 1.04169703, "balance_loss_mlp": 1.04589176, "epoch": 0.3146249812114835, "flos": 17092438439040.0, "grad_norm": 1.732593505271442, "language_loss": 0.79899549, "learning_rate": 3.2086852735002692e-06, "loss": 0.82045579, "num_input_tokens_seen": 112358480, "step": 5233, "time_per_iteration": 2.7261524200439453 }, { "auxiliary_loss_clip": 0.01095476, "auxiliary_loss_mlp": 0.01043701, "balance_loss_clip": 1.04795146, "balance_loss_mlp": 1.02775121, "epoch": 0.3146851044641515, "flos": 55291442889600.0, "grad_norm": 1.8884411146751285, "language_loss": 0.71124369, "learning_rate": 3.2083749569102024e-06, "loss": 0.73263544, "num_input_tokens_seen": 112382350, "step": 5234, "time_per_iteration": 3.0071427822113037 }, { "auxiliary_loss_clip": 0.01105209, "auxiliary_loss_mlp": 0.01036666, "balance_loss_clip": 1.05008078, "balance_loss_mlp": 1.02060878, "epoch": 0.31474522771681945, "flos": 27015884928000.0, "grad_norm": 2.1537517260325396, "language_loss": 0.72106552, "learning_rate": 3.2080645944984356e-06, "loss": 0.74248433, "num_input_tokens_seen": 112400260, "step": 5235, "time_per_iteration": 2.7347464561462402 }, { "auxiliary_loss_clip": 0.011281, "auxiliary_loss_mlp": 0.0103842, "balance_loss_clip": 1.0479089, "balance_loss_mlp": 1.0225656, "epoch": 0.3148053509694875, "flos": 21251935330560.0, "grad_norm": 2.047935998004664, "language_loss": 0.78640145, "learning_rate": 3.2077541862767384e-06, "loss": 0.80806667, "num_input_tokens_seen": 112419400, "step": 5236, "time_per_iteration": 2.6480181217193604 }, { "auxiliary_loss_clip": 0.01142531, "auxiliary_loss_mlp": 0.0104222, "balance_loss_clip": 1.04929006, "balance_loss_mlp": 1.02536416, "epoch": 0.31486547422215544, "flos": 31248675521280.0, "grad_norm": 1.8469097199945863, "language_loss": 0.75903904, "learning_rate": 3.207443732256881e-06, "loss": 0.78088653, "num_input_tokens_seen": 112440825, "step": 5237, "time_per_iteration": 2.7113847732543945 }, { "auxiliary_loss_clip": 0.01133953, "auxiliary_loss_mlp": 0.01035749, "balance_loss_clip": 1.04817045, "balance_loss_mlp": 1.02128255, "epoch": 0.3149255974748234, "flos": 19828615933440.0, "grad_norm": 2.176202072112168, "language_loss": 0.79725033, "learning_rate": 3.2071332324506372e-06, "loss": 0.81894737, "num_input_tokens_seen": 112459180, "step": 5238, "time_per_iteration": 2.649968147277832 }, { "auxiliary_loss_clip": 0.01046118, "auxiliary_loss_mlp": 0.01018852, "balance_loss_clip": 1.02561212, "balance_loss_mlp": 1.01676548, "epoch": 0.31498572072749137, "flos": 67683965339520.0, "grad_norm": 0.8324046464960934, "language_loss": 0.67913729, "learning_rate": 3.2068226868697795e-06, "loss": 0.69978696, "num_input_tokens_seen": 112516680, "step": 5239, "time_per_iteration": 3.130643606185913 }, { "auxiliary_loss_clip": 0.01121581, "auxiliary_loss_mlp": 0.01043617, "balance_loss_clip": 1.04828835, "balance_loss_mlp": 1.02528274, "epoch": 0.31504584398015933, "flos": 19793136274560.0, "grad_norm": 2.4702861290170235, "language_loss": 0.82906926, "learning_rate": 3.2065120955260846e-06, "loss": 0.85072124, "num_input_tokens_seen": 112535895, "step": 5240, "time_per_iteration": 2.6314027309417725 }, { "auxiliary_loss_clip": 0.0111196, "auxiliary_loss_mlp": 0.0077379, "balance_loss_clip": 1.04708409, "balance_loss_mlp": 1.00132334, "epoch": 0.3151059672328273, "flos": 26615409217920.0, "grad_norm": 1.6854261536361361, "language_loss": 0.81405544, "learning_rate": 3.2062014584313302e-06, "loss": 0.83291298, "num_input_tokens_seen": 112557490, "step": 5241, "time_per_iteration": 2.7245657444000244 }, { "auxiliary_loss_clip": 0.01138561, "auxiliary_loss_mlp": 0.01038584, "balance_loss_clip": 1.05094576, "balance_loss_mlp": 1.0230633, "epoch": 0.31516609048549526, "flos": 24204438483840.0, "grad_norm": 1.7554610875937957, "language_loss": 0.74513441, "learning_rate": 3.2058907755972956e-06, "loss": 0.7669059, "num_input_tokens_seen": 112577075, "step": 5242, "time_per_iteration": 2.5925803184509277 }, { "auxiliary_loss_clip": 0.01106752, "auxiliary_loss_mlp": 0.01039069, "balance_loss_clip": 1.04686832, "balance_loss_mlp": 1.02230775, "epoch": 0.31522621373816323, "flos": 25958710817280.0, "grad_norm": 12.905078117761404, "language_loss": 0.73457384, "learning_rate": 3.2055800470357626e-06, "loss": 0.75603199, "num_input_tokens_seen": 112597620, "step": 5243, "time_per_iteration": 2.721261739730835 }, { "auxiliary_loss_clip": 0.01126602, "auxiliary_loss_mlp": 0.01041378, "balance_loss_clip": 1.04783881, "balance_loss_mlp": 1.02524936, "epoch": 0.3152863369908312, "flos": 21908813299200.0, "grad_norm": 2.079273463581607, "language_loss": 0.6462577, "learning_rate": 3.205269272758513e-06, "loss": 0.66793752, "num_input_tokens_seen": 112617150, "step": 5244, "time_per_iteration": 2.6753153800964355 }, { "auxiliary_loss_clip": 0.01087107, "auxiliary_loss_mlp": 0.01037472, "balance_loss_clip": 1.04454994, "balance_loss_mlp": 1.02158141, "epoch": 0.31534646024349916, "flos": 16281072074880.0, "grad_norm": 2.126512737541558, "language_loss": 0.91117549, "learning_rate": 3.2049584527773313e-06, "loss": 0.93242127, "num_input_tokens_seen": 112631090, "step": 5245, "time_per_iteration": 2.717316150665283 }, { "auxiliary_loss_clip": 0.01129236, "auxiliary_loss_mlp": 0.01046116, "balance_loss_clip": 1.04892504, "balance_loss_mlp": 1.02911687, "epoch": 0.3154065834961671, "flos": 24717243000960.0, "grad_norm": 2.0341104694483296, "language_loss": 0.75199413, "learning_rate": 3.2046475871040048e-06, "loss": 0.77374756, "num_input_tokens_seen": 112651220, "step": 5246, "time_per_iteration": 2.738969564437866 }, { "auxiliary_loss_clip": 0.01139621, "auxiliary_loss_mlp": 0.01044826, "balance_loss_clip": 1.04860735, "balance_loss_mlp": 1.027946, "epoch": 0.3154667067488351, "flos": 35371148469120.0, "grad_norm": 1.7161631839732394, "language_loss": 0.61524433, "learning_rate": 3.204336675750321e-06, "loss": 0.63708878, "num_input_tokens_seen": 112671560, "step": 5247, "time_per_iteration": 2.714258909225464 }, { "auxiliary_loss_clip": 0.01129569, "auxiliary_loss_mlp": 0.0104508, "balance_loss_clip": 1.04842138, "balance_loss_mlp": 1.0283072, "epoch": 0.31552683000150306, "flos": 17456464823040.0, "grad_norm": 2.438581052681848, "language_loss": 0.82096362, "learning_rate": 3.2040257187280693e-06, "loss": 0.84271014, "num_input_tokens_seen": 112689790, "step": 5248, "time_per_iteration": 2.6235198974609375 }, { "auxiliary_loss_clip": 0.01121718, "auxiliary_loss_mlp": 0.01047358, "balance_loss_clip": 1.04964209, "balance_loss_mlp": 1.0292145, "epoch": 0.3155869532541711, "flos": 18405763413120.0, "grad_norm": 5.654706808285272, "language_loss": 0.84601712, "learning_rate": 3.2037147160490423e-06, "loss": 0.86770785, "num_input_tokens_seen": 112708265, "step": 5249, "time_per_iteration": 2.664454698562622 }, { "auxiliary_loss_clip": 0.01105599, "auxiliary_loss_mlp": 0.01040266, "balance_loss_clip": 1.04724038, "balance_loss_mlp": 1.02252758, "epoch": 0.31564707650683904, "flos": 21579763783680.0, "grad_norm": 2.1333510394712034, "language_loss": 0.85412121, "learning_rate": 3.2034036677250322e-06, "loss": 0.87557989, "num_input_tokens_seen": 112727820, "step": 5250, "time_per_iteration": 2.7892768383026123 }, { "auxiliary_loss_clip": 0.01110748, "auxiliary_loss_mlp": 0.01044305, "balance_loss_clip": 1.04626083, "balance_loss_mlp": 1.02721059, "epoch": 0.315707199759507, "flos": 21030976817280.0, "grad_norm": 3.250818956981283, "language_loss": 0.68651402, "learning_rate": 3.203092573767835e-06, "loss": 0.70806456, "num_input_tokens_seen": 112743140, "step": 5251, "time_per_iteration": 2.660738468170166 }, { "auxiliary_loss_clip": 0.01141131, "auxiliary_loss_mlp": 0.01040852, "balance_loss_clip": 1.05063367, "balance_loss_mlp": 1.02374566, "epoch": 0.31576732301217497, "flos": 26828861788800.0, "grad_norm": 1.6959923935223091, "language_loss": 0.79367268, "learning_rate": 3.202781434189246e-06, "loss": 0.81549257, "num_input_tokens_seen": 112764705, "step": 5252, "time_per_iteration": 2.6600146293640137 }, { "auxiliary_loss_clip": 0.01123952, "auxiliary_loss_mlp": 0.01055554, "balance_loss_clip": 1.04919744, "balance_loss_mlp": 1.03742182, "epoch": 0.31582744626484294, "flos": 22711165349760.0, "grad_norm": 1.5850214403847396, "language_loss": 0.74167955, "learning_rate": 3.202470249001066e-06, "loss": 0.76347458, "num_input_tokens_seen": 112785310, "step": 5253, "time_per_iteration": 2.6831557750701904 }, { "auxiliary_loss_clip": 0.01117625, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.04685211, "balance_loss_mlp": 1.02571261, "epoch": 0.3158875695175109, "flos": 23951914894080.0, "grad_norm": 1.8578399335985847, "language_loss": 0.73295557, "learning_rate": 3.2021590182150924e-06, "loss": 0.75456059, "num_input_tokens_seen": 112802905, "step": 5254, "time_per_iteration": 2.664445161819458 }, { "auxiliary_loss_clip": 0.0112999, "auxiliary_loss_mlp": 0.0104166, "balance_loss_clip": 1.04998255, "balance_loss_mlp": 1.02442837, "epoch": 0.31594769277017887, "flos": 13261883322240.0, "grad_norm": 1.9116991379626416, "language_loss": 0.77497417, "learning_rate": 3.201847741843128e-06, "loss": 0.7966907, "num_input_tokens_seen": 112820305, "step": 5255, "time_per_iteration": 2.5817084312438965 }, { "auxiliary_loss_clip": 0.01116092, "auxiliary_loss_mlp": 0.01045862, "balance_loss_clip": 1.0481391, "balance_loss_mlp": 1.02718151, "epoch": 0.31600781602284683, "flos": 23368258800000.0, "grad_norm": 2.396272573281143, "language_loss": 0.7821492, "learning_rate": 3.2015364198969772e-06, "loss": 0.80376875, "num_input_tokens_seen": 112841185, "step": 5256, "time_per_iteration": 2.6798577308654785 }, { "auxiliary_loss_clip": 0.0109858, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.04874921, "balance_loss_mlp": 1.02676511, "epoch": 0.3160679392755148, "flos": 19828580019840.0, "grad_norm": 1.575034121408654, "language_loss": 0.71175283, "learning_rate": 3.2012250523884453e-06, "loss": 0.73316103, "num_input_tokens_seen": 112860570, "step": 5257, "time_per_iteration": 4.252342462539673 }, { "auxiliary_loss_clip": 0.01132481, "auxiliary_loss_mlp": 0.01043271, "balance_loss_clip": 1.05120182, "balance_loss_mlp": 1.02524674, "epoch": 0.31612806252818276, "flos": 20193216935040.0, "grad_norm": 2.0196036815267036, "language_loss": 0.76539034, "learning_rate": 3.2009136393293393e-06, "loss": 0.78714788, "num_input_tokens_seen": 112877975, "step": 5258, "time_per_iteration": 4.240477085113525 }, { "auxiliary_loss_clip": 0.01110908, "auxiliary_loss_mlp": 0.01047088, "balance_loss_clip": 1.04727268, "balance_loss_mlp": 1.02917099, "epoch": 0.31618818578085073, "flos": 24235967646720.0, "grad_norm": 3.2354010090655403, "language_loss": 0.72901475, "learning_rate": 3.200602180731467e-06, "loss": 0.75059474, "num_input_tokens_seen": 112896170, "step": 5259, "time_per_iteration": 2.726944923400879 }, { "auxiliary_loss_clip": 0.01117115, "auxiliary_loss_mlp": 0.00776982, "balance_loss_clip": 1.04983401, "balance_loss_mlp": 1.0013001, "epoch": 0.3162483090335187, "flos": 25081844002560.0, "grad_norm": 2.1961272089612307, "language_loss": 0.66124642, "learning_rate": 3.20029067660664e-06, "loss": 0.68018734, "num_input_tokens_seen": 112916180, "step": 5260, "time_per_iteration": 2.7605621814727783 }, { "auxiliary_loss_clip": 0.01130372, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.04645884, "balance_loss_mlp": 1.02016842, "epoch": 0.31630843228618666, "flos": 26323383646080.0, "grad_norm": 1.8277182943015604, "language_loss": 0.71989, "learning_rate": 3.1999791269666706e-06, "loss": 0.74156475, "num_input_tokens_seen": 112936745, "step": 5261, "time_per_iteration": 4.231431484222412 }, { "auxiliary_loss_clip": 0.01044321, "auxiliary_loss_mlp": 0.01007323, "balance_loss_clip": 1.02311194, "balance_loss_mlp": 1.00424767, "epoch": 0.3163685555388547, "flos": 66758441552640.0, "grad_norm": 0.7429950107461195, "language_loss": 0.50646758, "learning_rate": 3.1996675318233716e-06, "loss": 0.5269841, "num_input_tokens_seen": 112994845, "step": 5262, "time_per_iteration": 3.232384443283081 }, { "auxiliary_loss_clip": 0.01131333, "auxiliary_loss_mlp": 0.01046761, "balance_loss_clip": 1.05222106, "balance_loss_mlp": 1.02932084, "epoch": 0.31642867879152264, "flos": 25995662933760.0, "grad_norm": 1.5863649349069382, "language_loss": 0.85187083, "learning_rate": 3.19935589118856e-06, "loss": 0.8736518, "num_input_tokens_seen": 113015125, "step": 5263, "time_per_iteration": 4.33522629737854 }, { "auxiliary_loss_clip": 0.01112644, "auxiliary_loss_mlp": 0.01048382, "balance_loss_clip": 1.04875994, "balance_loss_mlp": 1.03256297, "epoch": 0.3164888020441906, "flos": 25774955815680.0, "grad_norm": 1.550008856477613, "language_loss": 0.81648135, "learning_rate": 3.1990442050740535e-06, "loss": 0.83809161, "num_input_tokens_seen": 113035535, "step": 5264, "time_per_iteration": 2.8155312538146973 }, { "auxiliary_loss_clip": 0.01121259, "auxiliary_loss_mlp": 0.0104222, "balance_loss_clip": 1.04812968, "balance_loss_mlp": 1.02431464, "epoch": 0.3165489252968586, "flos": 19756220071680.0, "grad_norm": 2.234025317189389, "language_loss": 0.78969181, "learning_rate": 3.19873247349167e-06, "loss": 0.81132656, "num_input_tokens_seen": 113052720, "step": 5265, "time_per_iteration": 2.6533524990081787 }, { "auxiliary_loss_clip": 0.0113452, "auxiliary_loss_mlp": 0.01049591, "balance_loss_clip": 1.05209899, "balance_loss_mlp": 1.03144741, "epoch": 0.31660904854952654, "flos": 23183929180800.0, "grad_norm": 1.789116232573577, "language_loss": 0.74705631, "learning_rate": 3.1984206964532307e-06, "loss": 0.76889741, "num_input_tokens_seen": 113071435, "step": 5266, "time_per_iteration": 2.66683292388916 }, { "auxiliary_loss_clip": 0.01108402, "auxiliary_loss_mlp": 0.0104338, "balance_loss_clip": 1.04636073, "balance_loss_mlp": 1.02660751, "epoch": 0.3166691718021945, "flos": 20408501099520.0, "grad_norm": 2.507852328081816, "language_loss": 0.79178059, "learning_rate": 3.1981088739705585e-06, "loss": 0.81329834, "num_input_tokens_seen": 113088645, "step": 5267, "time_per_iteration": 2.6870310306549072 }, { "auxiliary_loss_clip": 0.0103642, "auxiliary_loss_mlp": 0.01002482, "balance_loss_clip": 1.02563763, "balance_loss_mlp": 1.00002623, "epoch": 0.31672929505486247, "flos": 70144781172480.0, "grad_norm": 0.7343006553516018, "language_loss": 0.57840127, "learning_rate": 3.197797006055478e-06, "loss": 0.59879029, "num_input_tokens_seen": 113152775, "step": 5268, "time_per_iteration": 3.211494207382202 }, { "auxiliary_loss_clip": 0.01144761, "auxiliary_loss_mlp": 0.01044165, "balance_loss_clip": 1.0517385, "balance_loss_mlp": 1.02729666, "epoch": 0.31678941830753043, "flos": 14355758154240.0, "grad_norm": 2.2657818682072146, "language_loss": 0.73009932, "learning_rate": 3.197485092719815e-06, "loss": 0.75198865, "num_input_tokens_seen": 113171410, "step": 5269, "time_per_iteration": 2.5840115547180176 }, { "auxiliary_loss_clip": 0.01108492, "auxiliary_loss_mlp": 0.01049824, "balance_loss_clip": 1.0489136, "balance_loss_mlp": 1.03283644, "epoch": 0.3168495415601984, "flos": 22747722416640.0, "grad_norm": 2.2273308320264995, "language_loss": 0.79972744, "learning_rate": 3.1971731339753973e-06, "loss": 0.82131052, "num_input_tokens_seen": 113189965, "step": 5270, "time_per_iteration": 2.858154535293579 }, { "auxiliary_loss_clip": 0.01146892, "auxiliary_loss_mlp": 0.01050124, "balance_loss_clip": 1.05206418, "balance_loss_mlp": 1.03207529, "epoch": 0.31690966481286637, "flos": 20115254465280.0, "grad_norm": 9.25747726986636, "language_loss": 0.7941646, "learning_rate": 3.1968611298340545e-06, "loss": 0.81613475, "num_input_tokens_seen": 113206355, "step": 5271, "time_per_iteration": 2.6510884761810303 }, { "auxiliary_loss_clip": 0.01144344, "auxiliary_loss_mlp": 0.01040088, "balance_loss_clip": 1.05230093, "balance_loss_mlp": 1.02269578, "epoch": 0.31696978806553433, "flos": 21178928937600.0, "grad_norm": 1.806612869692892, "language_loss": 0.72429144, "learning_rate": 3.1965490803076173e-06, "loss": 0.74613577, "num_input_tokens_seen": 113225440, "step": 5272, "time_per_iteration": 2.6807363033294678 }, { "auxiliary_loss_clip": 0.01123855, "auxiliary_loss_mlp": 0.01052611, "balance_loss_clip": 1.04942703, "balance_loss_mlp": 1.03365636, "epoch": 0.3170299113182023, "flos": 42997030439040.0, "grad_norm": 2.241731745129767, "language_loss": 0.69146693, "learning_rate": 3.1962369854079194e-06, "loss": 0.71323156, "num_input_tokens_seen": 113248840, "step": 5273, "time_per_iteration": 2.9202728271484375 }, { "auxiliary_loss_clip": 0.01128467, "auxiliary_loss_mlp": 0.00775845, "balance_loss_clip": 1.04869509, "balance_loss_mlp": 1.00146461, "epoch": 0.31709003457087026, "flos": 24460158384000.0, "grad_norm": 1.872718303622414, "language_loss": 0.67764306, "learning_rate": 3.195924845146795e-06, "loss": 0.69668615, "num_input_tokens_seen": 113269630, "step": 5274, "time_per_iteration": 2.6541714668273926 }, { "auxiliary_loss_clip": 0.01092683, "auxiliary_loss_mlp": 0.0106112, "balance_loss_clip": 1.04346347, "balance_loss_mlp": 1.04305935, "epoch": 0.3171501578235382, "flos": 24135310759680.0, "grad_norm": 1.7402048894999724, "language_loss": 0.80815518, "learning_rate": 3.195612659536081e-06, "loss": 0.8296932, "num_input_tokens_seen": 113291200, "step": 5275, "time_per_iteration": 2.840696096420288 }, { "auxiliary_loss_clip": 0.0113287, "auxiliary_loss_mlp": 0.01047853, "balance_loss_clip": 1.04862475, "balance_loss_mlp": 1.02979279, "epoch": 0.31721028107620625, "flos": 18879712392960.0, "grad_norm": 2.28886723118271, "language_loss": 0.72418922, "learning_rate": 3.1953004285876147e-06, "loss": 0.74599648, "num_input_tokens_seen": 113310170, "step": 5276, "time_per_iteration": 2.6426591873168945 }, { "auxiliary_loss_clip": 0.01122606, "auxiliary_loss_mlp": 0.01041381, "balance_loss_clip": 1.05439019, "balance_loss_mlp": 1.02588356, "epoch": 0.3172704043288742, "flos": 23147874904320.0, "grad_norm": 1.4542936031710312, "language_loss": 0.77923822, "learning_rate": 3.194988152313236e-06, "loss": 0.80087811, "num_input_tokens_seen": 113331140, "step": 5277, "time_per_iteration": 2.7192864418029785 }, { "auxiliary_loss_clip": 0.01113098, "auxiliary_loss_mlp": 0.01054598, "balance_loss_clip": 1.04708886, "balance_loss_mlp": 1.03432024, "epoch": 0.3173305275815422, "flos": 17858520731520.0, "grad_norm": 2.071832444797603, "language_loss": 0.79029107, "learning_rate": 3.1946758307247878e-06, "loss": 0.81196797, "num_input_tokens_seen": 113350030, "step": 5278, "time_per_iteration": 2.606973648071289 }, { "auxiliary_loss_clip": 0.01041198, "auxiliary_loss_mlp": 0.01006121, "balance_loss_clip": 1.02207565, "balance_loss_mlp": 1.00391531, "epoch": 0.31739065083421014, "flos": 59973476883840.0, "grad_norm": 0.8783580735908582, "language_loss": 0.62817574, "learning_rate": 3.1943634638341114e-06, "loss": 0.64864898, "num_input_tokens_seen": 113395820, "step": 5279, "time_per_iteration": 2.998594284057617 }, { "auxiliary_loss_clip": 0.01146927, "auxiliary_loss_mlp": 0.01055699, "balance_loss_clip": 1.05080009, "balance_loss_mlp": 1.03651857, "epoch": 0.3174507740868781, "flos": 23800981944960.0, "grad_norm": 1.4881688285488497, "language_loss": 0.80855167, "learning_rate": 3.194051051653053e-06, "loss": 0.83057791, "num_input_tokens_seen": 113416835, "step": 5280, "time_per_iteration": 2.662240743637085 }, { "auxiliary_loss_clip": 0.0110603, "auxiliary_loss_mlp": 0.01050191, "balance_loss_clip": 1.04850507, "balance_loss_mlp": 1.0339663, "epoch": 0.31751089733954607, "flos": 27638899349760.0, "grad_norm": 1.6411021360183768, "language_loss": 0.77964067, "learning_rate": 3.19373859419346e-06, "loss": 0.80120289, "num_input_tokens_seen": 113440850, "step": 5281, "time_per_iteration": 2.8303840160369873 }, { "auxiliary_loss_clip": 0.01119054, "auxiliary_loss_mlp": 0.0103955, "balance_loss_clip": 1.04812443, "balance_loss_mlp": 1.02194262, "epoch": 0.31757102059221404, "flos": 23769273214080.0, "grad_norm": 2.6184534699054116, "language_loss": 0.78539747, "learning_rate": 3.193426091467179e-06, "loss": 0.80698353, "num_input_tokens_seen": 113461000, "step": 5282, "time_per_iteration": 2.75915265083313 }, { "auxiliary_loss_clip": 0.01122553, "auxiliary_loss_mlp": 0.01050996, "balance_loss_clip": 1.0517695, "balance_loss_mlp": 1.03284001, "epoch": 0.317631143844882, "flos": 25264521596160.0, "grad_norm": 1.8901773671102746, "language_loss": 0.67857707, "learning_rate": 3.193113543486061e-06, "loss": 0.70031261, "num_input_tokens_seen": 113480820, "step": 5283, "time_per_iteration": 2.710601329803467 }, { "auxiliary_loss_clip": 0.01039071, "auxiliary_loss_mlp": 0.01003581, "balance_loss_clip": 1.02084279, "balance_loss_mlp": 1.00145948, "epoch": 0.31769126709754997, "flos": 55825939221120.0, "grad_norm": 0.7284643981615322, "language_loss": 0.52787578, "learning_rate": 3.192800950261958e-06, "loss": 0.54830229, "num_input_tokens_seen": 113536910, "step": 5284, "time_per_iteration": 3.1312994956970215 }, { "auxiliary_loss_clip": 0.01123508, "auxiliary_loss_mlp": 0.01041652, "balance_loss_clip": 1.05256152, "balance_loss_mlp": 1.02529633, "epoch": 0.31775139035021793, "flos": 16690562098560.0, "grad_norm": 1.6358492252526933, "language_loss": 0.70703542, "learning_rate": 3.1924883118067235e-06, "loss": 0.72868699, "num_input_tokens_seen": 113555480, "step": 5285, "time_per_iteration": 2.66414213180542 }, { "auxiliary_loss_clip": 0.01051594, "auxiliary_loss_mlp": 0.01001353, "balance_loss_clip": 1.02112103, "balance_loss_mlp": 0.99919558, "epoch": 0.3178115136028859, "flos": 64227241019520.0, "grad_norm": 0.8795363824150627, "language_loss": 0.60495377, "learning_rate": 3.1921756281322123e-06, "loss": 0.62548316, "num_input_tokens_seen": 113616790, "step": 5286, "time_per_iteration": 3.1636195182800293 }, { "auxiliary_loss_clip": 0.01145219, "auxiliary_loss_mlp": 0.01047411, "balance_loss_clip": 1.05137587, "balance_loss_mlp": 1.02995849, "epoch": 0.31787163685555386, "flos": 18697465762560.0, "grad_norm": 10.257300688850748, "language_loss": 0.72160053, "learning_rate": 3.1918628992502826e-06, "loss": 0.74352682, "num_input_tokens_seen": 113635320, "step": 5287, "time_per_iteration": 2.628863573074341 }, { "auxiliary_loss_clip": 0.01132987, "auxiliary_loss_mlp": 0.0105662, "balance_loss_clip": 1.04966712, "balance_loss_mlp": 1.03823805, "epoch": 0.31793176010822183, "flos": 21324762155520.0, "grad_norm": 2.3229849512265126, "language_loss": 0.75706261, "learning_rate": 3.191550125172792e-06, "loss": 0.77895868, "num_input_tokens_seen": 113654000, "step": 5288, "time_per_iteration": 2.7565319538116455 }, { "auxiliary_loss_clip": 0.01128698, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.04913831, "balance_loss_mlp": 1.02223587, "epoch": 0.31799188336088985, "flos": 20958688696320.0, "grad_norm": 3.550043827117326, "language_loss": 0.87827504, "learning_rate": 3.1912373059116007e-06, "loss": 0.89993572, "num_input_tokens_seen": 113672375, "step": 5289, "time_per_iteration": 2.6671485900878906 }, { "auxiliary_loss_clip": 0.01126628, "auxiliary_loss_mlp": 0.01039655, "balance_loss_clip": 1.05225897, "balance_loss_mlp": 1.02443218, "epoch": 0.3180520066135578, "flos": 22491930689280.0, "grad_norm": 1.767762146387748, "language_loss": 0.68103814, "learning_rate": 3.190924441478572e-06, "loss": 0.70270097, "num_input_tokens_seen": 113692385, "step": 5290, "time_per_iteration": 2.6986947059631348 }, { "auxiliary_loss_clip": 0.01120385, "auxiliary_loss_mlp": 0.01046806, "balance_loss_clip": 1.04791737, "balance_loss_mlp": 1.02924609, "epoch": 0.3181121298662258, "flos": 27235335070080.0, "grad_norm": 2.1353951835610303, "language_loss": 0.80298805, "learning_rate": 3.1906115318855687e-06, "loss": 0.82465994, "num_input_tokens_seen": 113712145, "step": 5291, "time_per_iteration": 2.67692494392395 }, { "auxiliary_loss_clip": 0.01112404, "auxiliary_loss_mlp": 0.01038285, "balance_loss_clip": 1.05768418, "balance_loss_mlp": 1.02066636, "epoch": 0.31817225311889374, "flos": 23180158252800.0, "grad_norm": 4.0426741537939614, "language_loss": 0.79877901, "learning_rate": 3.1902985771444577e-06, "loss": 0.82028592, "num_input_tokens_seen": 113731435, "step": 5292, "time_per_iteration": 2.8386974334716797 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.01037968, "balance_loss_clip": 1.05076253, "balance_loss_mlp": 1.0233407, "epoch": 0.3182323763715617, "flos": 23258803080960.0, "grad_norm": 1.5696258430885255, "language_loss": 0.74754488, "learning_rate": 3.1899855772671043e-06, "loss": 0.7691924, "num_input_tokens_seen": 113750825, "step": 5293, "time_per_iteration": 2.651566982269287 }, { "auxiliary_loss_clip": 0.01129161, "auxiliary_loss_mlp": 0.01045458, "balance_loss_clip": 1.05253696, "balance_loss_mlp": 1.03027081, "epoch": 0.3182924996242297, "flos": 29016683280000.0, "grad_norm": 1.9205945835079516, "language_loss": 0.74100351, "learning_rate": 3.189672532265379e-06, "loss": 0.76274973, "num_input_tokens_seen": 113770010, "step": 5294, "time_per_iteration": 2.6593024730682373 }, { "auxiliary_loss_clip": 0.01145372, "auxiliary_loss_mlp": 0.01038723, "balance_loss_clip": 1.05254447, "balance_loss_mlp": 1.02166462, "epoch": 0.31835262287689764, "flos": 20449188230400.0, "grad_norm": 3.618714545146935, "language_loss": 0.76019043, "learning_rate": 3.189359442151152e-06, "loss": 0.78203136, "num_input_tokens_seen": 113788640, "step": 5295, "time_per_iteration": 2.597567558288574 }, { "auxiliary_loss_clip": 0.01110615, "auxiliary_loss_mlp": 0.01046432, "balance_loss_clip": 1.04994202, "balance_loss_mlp": 1.02979052, "epoch": 0.3184127461295656, "flos": 25119478477440.0, "grad_norm": 2.278908740959458, "language_loss": 0.69146252, "learning_rate": 3.189046306936296e-06, "loss": 0.71303296, "num_input_tokens_seen": 113809515, "step": 5296, "time_per_iteration": 4.286029100418091 }, { "auxiliary_loss_clip": 0.01115954, "auxiliary_loss_mlp": 0.01043279, "balance_loss_clip": 1.04866266, "balance_loss_mlp": 1.02709007, "epoch": 0.31847286938223357, "flos": 25551231955200.0, "grad_norm": 1.7786470593469696, "language_loss": 0.77374327, "learning_rate": 3.1887331266326846e-06, "loss": 0.79533565, "num_input_tokens_seen": 113829770, "step": 5297, "time_per_iteration": 4.164870023727417 }, { "auxiliary_loss_clip": 0.0111312, "auxiliary_loss_mlp": 0.01036407, "balance_loss_clip": 1.05341816, "balance_loss_mlp": 1.01857328, "epoch": 0.31853299263490154, "flos": 27782470010880.0, "grad_norm": 2.4185702861431104, "language_loss": 0.79294181, "learning_rate": 3.1884199012521942e-06, "loss": 0.81443709, "num_input_tokens_seen": 113849320, "step": 5298, "time_per_iteration": 2.761035919189453 }, { "auxiliary_loss_clip": 0.01127152, "auxiliary_loss_mlp": 0.01052383, "balance_loss_clip": 1.05250955, "balance_loss_mlp": 1.0361588, "epoch": 0.3185931158875695, "flos": 22706747976960.0, "grad_norm": 2.109744523678234, "language_loss": 0.74082595, "learning_rate": 3.1881066308067016e-06, "loss": 0.76262128, "num_input_tokens_seen": 113867860, "step": 5299, "time_per_iteration": 2.6674296855926514 }, { "auxiliary_loss_clip": 0.01133842, "auxiliary_loss_mlp": 0.01048899, "balance_loss_clip": 1.05652189, "balance_loss_mlp": 1.03213775, "epoch": 0.31865323914023747, "flos": 24571517523840.0, "grad_norm": 2.0125699214837627, "language_loss": 0.78636098, "learning_rate": 3.1877933153080873e-06, "loss": 0.80818832, "num_input_tokens_seen": 113886375, "step": 5300, "time_per_iteration": 2.721202850341797 }, { "auxiliary_loss_clip": 0.01119633, "auxiliary_loss_mlp": 0.01050293, "balance_loss_clip": 1.04830885, "balance_loss_mlp": 1.03297138, "epoch": 0.31871336239290543, "flos": 18186564666240.0, "grad_norm": 1.8639511619571896, "language_loss": 0.83660495, "learning_rate": 3.1874799547682304e-06, "loss": 0.8583042, "num_input_tokens_seen": 113904065, "step": 5301, "time_per_iteration": 4.22704291343689 }, { "auxiliary_loss_clip": 0.01131996, "auxiliary_loss_mlp": 0.01049945, "balance_loss_clip": 1.05371821, "balance_loss_mlp": 1.03263569, "epoch": 0.31877348564557345, "flos": 21826756679040.0, "grad_norm": 2.3173946845583444, "language_loss": 0.77328432, "learning_rate": 3.187166549199015e-06, "loss": 0.79510373, "num_input_tokens_seen": 113918415, "step": 5302, "time_per_iteration": 2.6678919792175293 }, { "auxiliary_loss_clip": 0.011364, "auxiliary_loss_mlp": 0.01039827, "balance_loss_clip": 1.04891157, "balance_loss_mlp": 1.02270818, "epoch": 0.3188336088982414, "flos": 22015252275840.0, "grad_norm": 2.352282677018458, "language_loss": 0.79816842, "learning_rate": 3.1868530986123255e-06, "loss": 0.81993073, "num_input_tokens_seen": 113938135, "step": 5303, "time_per_iteration": 4.289660453796387 }, { "auxiliary_loss_clip": 0.0113563, "auxiliary_loss_mlp": 0.01045445, "balance_loss_clip": 1.05256605, "balance_loss_mlp": 1.02739668, "epoch": 0.3188937321509094, "flos": 20047886507520.0, "grad_norm": 2.03328242361333, "language_loss": 0.72914493, "learning_rate": 3.186539603020047e-06, "loss": 0.7509557, "num_input_tokens_seen": 113957125, "step": 5304, "time_per_iteration": 2.6123225688934326 }, { "auxiliary_loss_clip": 0.01106707, "auxiliary_loss_mlp": 0.01038113, "balance_loss_clip": 1.04701817, "balance_loss_mlp": 1.02234125, "epoch": 0.31895385540357735, "flos": 25848105863040.0, "grad_norm": 2.816339992135166, "language_loss": 0.71918428, "learning_rate": 3.186226062434068e-06, "loss": 0.74063241, "num_input_tokens_seen": 113974875, "step": 5305, "time_per_iteration": 2.7341108322143555 }, { "auxiliary_loss_clip": 0.01120594, "auxiliary_loss_mlp": 0.01042646, "balance_loss_clip": 1.05007052, "balance_loss_mlp": 1.0271126, "epoch": 0.3190139786562453, "flos": 23477714519040.0, "grad_norm": 2.1368418928112067, "language_loss": 0.64082253, "learning_rate": 3.1859124768662778e-06, "loss": 0.66245496, "num_input_tokens_seen": 113994450, "step": 5306, "time_per_iteration": 2.678497791290283 }, { "auxiliary_loss_clip": 0.01113987, "auxiliary_loss_mlp": 0.01046306, "balance_loss_clip": 1.04777002, "balance_loss_mlp": 1.02913976, "epoch": 0.3190741019089133, "flos": 29095543589760.0, "grad_norm": 2.249856956834014, "language_loss": 0.7981708, "learning_rate": 3.1855988463285678e-06, "loss": 0.81977379, "num_input_tokens_seen": 114013945, "step": 5307, "time_per_iteration": 2.684825897216797 }, { "auxiliary_loss_clip": 0.01110939, "auxiliary_loss_mlp": 0.01046246, "balance_loss_clip": 1.04708028, "balance_loss_mlp": 1.02869821, "epoch": 0.31913422516158124, "flos": 17129534209920.0, "grad_norm": 1.891192054321282, "language_loss": 0.77413881, "learning_rate": 3.1852851708328308e-06, "loss": 0.79571068, "num_input_tokens_seen": 114031375, "step": 5308, "time_per_iteration": 2.62485408782959 }, { "auxiliary_loss_clip": 0.01142071, "auxiliary_loss_mlp": 0.01050679, "balance_loss_clip": 1.05399549, "balance_loss_mlp": 1.03109312, "epoch": 0.3191943484142492, "flos": 16069846147200.0, "grad_norm": 3.6914677983836586, "language_loss": 0.73960984, "learning_rate": 3.184971450390961e-06, "loss": 0.76153737, "num_input_tokens_seen": 114048465, "step": 5309, "time_per_iteration": 2.6268463134765625 }, { "auxiliary_loss_clip": 0.01134349, "auxiliary_loss_mlp": 0.01035267, "balance_loss_clip": 1.05286658, "balance_loss_mlp": 1.01932931, "epoch": 0.3192544716669172, "flos": 22966166977920.0, "grad_norm": 1.9182514579370458, "language_loss": 0.82652342, "learning_rate": 3.184657685014856e-06, "loss": 0.84821963, "num_input_tokens_seen": 114068415, "step": 5310, "time_per_iteration": 2.649099111557007 }, { "auxiliary_loss_clip": 0.01116653, "auxiliary_loss_mlp": 0.01039176, "balance_loss_clip": 1.04808259, "balance_loss_mlp": 1.02340484, "epoch": 0.31931459491958514, "flos": 26870339018880.0, "grad_norm": 2.200225110342558, "language_loss": 0.78296745, "learning_rate": 3.184343874716412e-06, "loss": 0.80452585, "num_input_tokens_seen": 114088565, "step": 5311, "time_per_iteration": 2.7054250240325928 }, { "auxiliary_loss_clip": 0.01106724, "auxiliary_loss_mlp": 0.01036895, "balance_loss_clip": 1.04822886, "balance_loss_mlp": 1.01952648, "epoch": 0.3193747181722531, "flos": 21836525178240.0, "grad_norm": 2.0057857548781883, "language_loss": 0.84169972, "learning_rate": 3.1840300195075295e-06, "loss": 0.86313581, "num_input_tokens_seen": 114107160, "step": 5312, "time_per_iteration": 2.749263048171997 }, { "auxiliary_loss_clip": 0.01093899, "auxiliary_loss_mlp": 0.01053441, "balance_loss_clip": 1.04266024, "balance_loss_mlp": 1.03477311, "epoch": 0.31943484142492107, "flos": 18324999682560.0, "grad_norm": 3.6700749085790063, "language_loss": 0.78648412, "learning_rate": 3.1837161194001102e-06, "loss": 0.80795753, "num_input_tokens_seen": 114123420, "step": 5313, "time_per_iteration": 2.720930814743042 }, { "auxiliary_loss_clip": 0.01130677, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.05141878, "balance_loss_mlp": 1.0219605, "epoch": 0.31949496467758903, "flos": 21615818060160.0, "grad_norm": 2.386195329240294, "language_loss": 0.86217451, "learning_rate": 3.183402174406057e-06, "loss": 0.88386285, "num_input_tokens_seen": 114139230, "step": 5314, "time_per_iteration": 2.6785764694213867 }, { "auxiliary_loss_clip": 0.01116655, "auxiliary_loss_mlp": 0.01050856, "balance_loss_clip": 1.04983997, "balance_loss_mlp": 1.03231871, "epoch": 0.31955508793025705, "flos": 21760214734080.0, "grad_norm": 1.996028492072791, "language_loss": 0.79866767, "learning_rate": 3.1830881845372747e-06, "loss": 0.82034278, "num_input_tokens_seen": 114159290, "step": 5315, "time_per_iteration": 2.723097085952759 }, { "auxiliary_loss_clip": 0.0110521, "auxiliary_loss_mlp": 0.01063258, "balance_loss_clip": 1.04667854, "balance_loss_mlp": 1.04386258, "epoch": 0.319615211182925, "flos": 17164331510400.0, "grad_norm": 2.2633227615123275, "language_loss": 0.67312729, "learning_rate": 3.18277414980567e-06, "loss": 0.69481194, "num_input_tokens_seen": 114177655, "step": 5316, "time_per_iteration": 2.7841827869415283 }, { "auxiliary_loss_clip": 0.01131119, "auxiliary_loss_mlp": 0.01046731, "balance_loss_clip": 1.05015874, "balance_loss_mlp": 1.03126907, "epoch": 0.319675334435593, "flos": 28112812416000.0, "grad_norm": 1.540647016415601, "language_loss": 0.69375229, "learning_rate": 3.1824600702231515e-06, "loss": 0.71553081, "num_input_tokens_seen": 114200880, "step": 5317, "time_per_iteration": 2.7080705165863037 }, { "auxiliary_loss_clip": 0.01036788, "auxiliary_loss_mlp": 0.01033442, "balance_loss_clip": 1.02571428, "balance_loss_mlp": 1.03117692, "epoch": 0.31973545768826095, "flos": 69501119408640.0, "grad_norm": 0.7974882454120521, "language_loss": 0.53049421, "learning_rate": 3.182145945801628e-06, "loss": 0.55119646, "num_input_tokens_seen": 114267145, "step": 5318, "time_per_iteration": 3.5072765350341797 }, { "auxiliary_loss_clip": 0.0114058, "auxiliary_loss_mlp": 0.01041014, "balance_loss_clip": 1.05322218, "balance_loss_mlp": 1.02509975, "epoch": 0.3197955809409289, "flos": 13699203408000.0, "grad_norm": 3.679429868734815, "language_loss": 0.84239668, "learning_rate": 3.181831776553012e-06, "loss": 0.86421257, "num_input_tokens_seen": 114284630, "step": 5319, "time_per_iteration": 2.6148228645324707 }, { "auxiliary_loss_clip": 0.0112589, "auxiliary_loss_mlp": 0.01041338, "balance_loss_clip": 1.04876614, "balance_loss_mlp": 1.02552485, "epoch": 0.3198557041935969, "flos": 33218124278400.0, "grad_norm": 1.684363339069699, "language_loss": 0.63463295, "learning_rate": 3.1815175624892165e-06, "loss": 0.65630519, "num_input_tokens_seen": 114305830, "step": 5320, "time_per_iteration": 2.7444913387298584 }, { "auxiliary_loss_clip": 0.01120865, "auxiliary_loss_mlp": 0.01042926, "balance_loss_clip": 1.05072045, "balance_loss_mlp": 1.02682114, "epoch": 0.31991582744626484, "flos": 23732033788800.0, "grad_norm": 2.113040492667506, "language_loss": 0.70552826, "learning_rate": 3.1812033036221567e-06, "loss": 0.72716618, "num_input_tokens_seen": 114325165, "step": 5321, "time_per_iteration": 2.7078404426574707 }, { "auxiliary_loss_clip": 0.01151862, "auxiliary_loss_mlp": 0.00776802, "balance_loss_clip": 1.05639851, "balance_loss_mlp": 1.00126243, "epoch": 0.3199759506989328, "flos": 18550842445440.0, "grad_norm": 2.699319417691227, "language_loss": 0.8659147, "learning_rate": 3.180888999963749e-06, "loss": 0.88520133, "num_input_tokens_seen": 114341310, "step": 5322, "time_per_iteration": 2.5562047958374023 }, { "auxiliary_loss_clip": 0.01119411, "auxiliary_loss_mlp": 0.01038951, "balance_loss_clip": 1.05106568, "balance_loss_mlp": 1.02265561, "epoch": 0.3200360739516008, "flos": 22418888382720.0, "grad_norm": 1.7451682184714292, "language_loss": 0.83021653, "learning_rate": 3.1805746515259123e-06, "loss": 0.85180014, "num_input_tokens_seen": 114360355, "step": 5323, "time_per_iteration": 2.6323180198669434 }, { "auxiliary_loss_clip": 0.01129356, "auxiliary_loss_mlp": 0.01041616, "balance_loss_clip": 1.05092812, "balance_loss_mlp": 1.02440214, "epoch": 0.32009619720426874, "flos": 20595236929920.0, "grad_norm": 1.6785162629315, "language_loss": 0.77686846, "learning_rate": 3.1802602583205663e-06, "loss": 0.79857814, "num_input_tokens_seen": 114379220, "step": 5324, "time_per_iteration": 2.6361289024353027 }, { "auxiliary_loss_clip": 0.01115575, "auxiliary_loss_mlp": 0.01035772, "balance_loss_clip": 1.04754376, "balance_loss_mlp": 1.01861751, "epoch": 0.3201563204569367, "flos": 18147637301760.0, "grad_norm": 1.9010400542588533, "language_loss": 0.80500418, "learning_rate": 3.1799458203596333e-06, "loss": 0.82651764, "num_input_tokens_seen": 114396365, "step": 5325, "time_per_iteration": 2.681349277496338 }, { "auxiliary_loss_clip": 0.01133585, "auxiliary_loss_mlp": 0.01039966, "balance_loss_clip": 1.05378425, "balance_loss_mlp": 1.02394414, "epoch": 0.32021644370960467, "flos": 31684235840640.0, "grad_norm": 1.7412856997403743, "language_loss": 0.74817789, "learning_rate": 3.179631337655037e-06, "loss": 0.76991343, "num_input_tokens_seen": 114416780, "step": 5326, "time_per_iteration": 2.6932616233825684 }, { "auxiliary_loss_clip": 0.01103829, "auxiliary_loss_mlp": 0.0104309, "balance_loss_clip": 1.05045807, "balance_loss_mlp": 1.02659154, "epoch": 0.32027656696227264, "flos": 26865921646080.0, "grad_norm": 1.642662123916105, "language_loss": 0.80796289, "learning_rate": 3.179316810218701e-06, "loss": 0.82943213, "num_input_tokens_seen": 114437405, "step": 5327, "time_per_iteration": 2.7527899742126465 }, { "auxiliary_loss_clip": 0.01115203, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.05185604, "balance_loss_mlp": 1.02162015, "epoch": 0.32033669021494066, "flos": 24169928492160.0, "grad_norm": 1.846540372387515, "language_loss": 0.77796161, "learning_rate": 3.179002238062554e-06, "loss": 0.79949659, "num_input_tokens_seen": 114458505, "step": 5328, "time_per_iteration": 2.7631096839904785 }, { "auxiliary_loss_clip": 0.01087281, "auxiliary_loss_mlp": 0.01043102, "balance_loss_clip": 1.0453198, "balance_loss_mlp": 1.0245527, "epoch": 0.3203968134676086, "flos": 24460768915200.0, "grad_norm": 1.6837826518335735, "language_loss": 0.74184239, "learning_rate": 3.178687621198524e-06, "loss": 0.76314622, "num_input_tokens_seen": 114479050, "step": 5329, "time_per_iteration": 2.7749221324920654 }, { "auxiliary_loss_clip": 0.01110066, "auxiliary_loss_mlp": 0.01036662, "balance_loss_clip": 1.04650402, "balance_loss_mlp": 1.02133203, "epoch": 0.3204569367202766, "flos": 18004713085440.0, "grad_norm": 1.7163505659405243, "language_loss": 0.71138644, "learning_rate": 3.1783729596385415e-06, "loss": 0.73285371, "num_input_tokens_seen": 114497415, "step": 5330, "time_per_iteration": 2.655578136444092 }, { "auxiliary_loss_clip": 0.01093261, "auxiliary_loss_mlp": 0.01053955, "balance_loss_clip": 1.05082417, "balance_loss_mlp": 1.03379714, "epoch": 0.32051705997294455, "flos": 30589678650240.0, "grad_norm": 1.6854796065505788, "language_loss": 0.80175424, "learning_rate": 3.1780582533945376e-06, "loss": 0.82322645, "num_input_tokens_seen": 114518785, "step": 5331, "time_per_iteration": 2.851639747619629 }, { "auxiliary_loss_clip": 0.01040347, "auxiliary_loss_mlp": 0.01008357, "balance_loss_clip": 1.02573299, "balance_loss_mlp": 1.0059495, "epoch": 0.3205771832256125, "flos": 68417979765120.0, "grad_norm": 0.8321512232204817, "language_loss": 0.57821107, "learning_rate": 3.177743502478447e-06, "loss": 0.59869808, "num_input_tokens_seen": 114577710, "step": 5332, "time_per_iteration": 3.1104307174682617 }, { "auxiliary_loss_clip": 0.01104131, "auxiliary_loss_mlp": 0.01038271, "balance_loss_clip": 1.04842329, "balance_loss_mlp": 1.02194548, "epoch": 0.3206373064782805, "flos": 30443953173120.0, "grad_norm": 1.7127909178457088, "language_loss": 0.72918129, "learning_rate": 3.177428706902205e-06, "loss": 0.75060534, "num_input_tokens_seen": 114598640, "step": 5333, "time_per_iteration": 2.7683963775634766 }, { "auxiliary_loss_clip": 0.01118957, "auxiliary_loss_mlp": 0.01043487, "balance_loss_clip": 1.04778981, "balance_loss_mlp": 1.02685761, "epoch": 0.32069742973094845, "flos": 22054502862720.0, "grad_norm": 2.1728626414536767, "language_loss": 0.70592654, "learning_rate": 3.1771138666777485e-06, "loss": 0.72755098, "num_input_tokens_seen": 114618780, "step": 5334, "time_per_iteration": 2.6861116886138916 }, { "auxiliary_loss_clip": 0.01100969, "auxiliary_loss_mlp": 0.01041644, "balance_loss_clip": 1.04742825, "balance_loss_mlp": 1.02536023, "epoch": 0.3207575529836164, "flos": 22054000072320.0, "grad_norm": 2.526978692505362, "language_loss": 0.77161503, "learning_rate": 3.1767989818170156e-06, "loss": 0.79304117, "num_input_tokens_seen": 114637525, "step": 5335, "time_per_iteration": 4.33164381980896 }, { "auxiliary_loss_clip": 0.01130469, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.05087018, "balance_loss_mlp": 1.02213204, "epoch": 0.3208176762362844, "flos": 34057536186240.0, "grad_norm": 1.6997548644452432, "language_loss": 0.68414462, "learning_rate": 3.1764840523319477e-06, "loss": 0.7058323, "num_input_tokens_seen": 114659705, "step": 5336, "time_per_iteration": 2.840373992919922 }, { "auxiliary_loss_clip": 0.01102432, "auxiliary_loss_mlp": 0.01055244, "balance_loss_clip": 1.04495001, "balance_loss_mlp": 1.03862596, "epoch": 0.32087779948895234, "flos": 21798711135360.0, "grad_norm": 1.733261513029939, "language_loss": 0.78828537, "learning_rate": 3.176169078234487e-06, "loss": 0.8098622, "num_input_tokens_seen": 114678340, "step": 5337, "time_per_iteration": 4.268811464309692 }, { "auxiliary_loss_clip": 0.01121282, "auxiliary_loss_mlp": 0.01039712, "balance_loss_clip": 1.04696417, "balance_loss_mlp": 1.02512085, "epoch": 0.3209379227416203, "flos": 21434110133760.0, "grad_norm": 2.1583979373304194, "language_loss": 0.74322718, "learning_rate": 3.1758540595365766e-06, "loss": 0.76483715, "num_input_tokens_seen": 114696980, "step": 5338, "time_per_iteration": 2.6442766189575195 }, { "auxiliary_loss_clip": 0.01119062, "auxiliary_loss_mlp": 0.01047297, "balance_loss_clip": 1.04633641, "balance_loss_mlp": 1.03078675, "epoch": 0.3209980459942883, "flos": 25849075530240.0, "grad_norm": 2.118549362741933, "language_loss": 0.62622869, "learning_rate": 3.1755389962501626e-06, "loss": 0.64789224, "num_input_tokens_seen": 114717330, "step": 5339, "time_per_iteration": 2.684843063354492 }, { "auxiliary_loss_clip": 0.01141698, "auxiliary_loss_mlp": 0.01046177, "balance_loss_clip": 1.05127931, "balance_loss_mlp": 1.02954674, "epoch": 0.32105816924695624, "flos": 19099162535040.0, "grad_norm": 2.480509085809345, "language_loss": 0.81685597, "learning_rate": 3.175223888387192e-06, "loss": 0.83873475, "num_input_tokens_seen": 114736320, "step": 5340, "time_per_iteration": 4.130942344665527 }, { "auxiliary_loss_clip": 0.01110441, "auxiliary_loss_mlp": 0.01050741, "balance_loss_clip": 1.04820514, "balance_loss_mlp": 1.03462362, "epoch": 0.3211182924996242, "flos": 16581860565120.0, "grad_norm": 2.326860742494733, "language_loss": 0.76571834, "learning_rate": 3.1749087359596137e-06, "loss": 0.78733015, "num_input_tokens_seen": 114754575, "step": 5341, "time_per_iteration": 2.7302300930023193 }, { "auxiliary_loss_clip": 0.01101828, "auxiliary_loss_mlp": 0.01044591, "balance_loss_clip": 1.04797173, "balance_loss_mlp": 1.02840281, "epoch": 0.3211784157522922, "flos": 22672202071680.0, "grad_norm": 1.680960149410583, "language_loss": 0.79268491, "learning_rate": 3.1745935389793786e-06, "loss": 0.81414914, "num_input_tokens_seen": 114773590, "step": 5342, "time_per_iteration": 4.462036609649658 }, { "auxiliary_loss_clip": 0.01118478, "auxiliary_loss_mlp": 0.01045941, "balance_loss_clip": 1.05000186, "balance_loss_mlp": 1.02876329, "epoch": 0.3212385390049602, "flos": 20558787603840.0, "grad_norm": 3.232512085646521, "language_loss": 0.74449253, "learning_rate": 3.174278297458438e-06, "loss": 0.76613677, "num_input_tokens_seen": 114790775, "step": 5343, "time_per_iteration": 2.7057244777679443 }, { "auxiliary_loss_clip": 0.01080228, "auxiliary_loss_mlp": 0.0104431, "balance_loss_clip": 1.04317784, "balance_loss_mlp": 1.02704811, "epoch": 0.32129866225762815, "flos": 24791147233920.0, "grad_norm": 1.672847320129023, "language_loss": 0.82661629, "learning_rate": 3.173963011408748e-06, "loss": 0.84786165, "num_input_tokens_seen": 114809835, "step": 5344, "time_per_iteration": 2.801013231277466 }, { "auxiliary_loss_clip": 0.01088811, "auxiliary_loss_mlp": 0.01042568, "balance_loss_clip": 1.04556143, "balance_loss_mlp": 1.02565217, "epoch": 0.3213587855102961, "flos": 18366871962240.0, "grad_norm": 22.33494793204904, "language_loss": 0.79863501, "learning_rate": 3.173647680842262e-06, "loss": 0.81994879, "num_input_tokens_seen": 114826505, "step": 5345, "time_per_iteration": 2.743778944015503 }, { "auxiliary_loss_clip": 0.01114864, "auxiliary_loss_mlp": 0.01041047, "balance_loss_clip": 1.04774046, "balance_loss_mlp": 1.02507281, "epoch": 0.3214189087629641, "flos": 27015992668800.0, "grad_norm": 2.095379605818748, "language_loss": 0.83340824, "learning_rate": 3.1733323057709384e-06, "loss": 0.85496742, "num_input_tokens_seen": 114846140, "step": 5346, "time_per_iteration": 2.8187026977539062 }, { "auxiliary_loss_clip": 0.01110187, "auxiliary_loss_mlp": 0.01045041, "balance_loss_clip": 1.04783988, "balance_loss_mlp": 1.02797008, "epoch": 0.32147903201563205, "flos": 23148269953920.0, "grad_norm": 1.6371928172660764, "language_loss": 0.81853002, "learning_rate": 3.1730168862067366e-06, "loss": 0.84008235, "num_input_tokens_seen": 114866660, "step": 5347, "time_per_iteration": 2.724003553390503 }, { "auxiliary_loss_clip": 0.0112676, "auxiliary_loss_mlp": 0.01047135, "balance_loss_clip": 1.048388, "balance_loss_mlp": 1.02891994, "epoch": 0.3215391552683, "flos": 16580747243520.0, "grad_norm": 4.152516057334243, "language_loss": 0.80263776, "learning_rate": 3.1727014221616164e-06, "loss": 0.8243767, "num_input_tokens_seen": 114882820, "step": 5348, "time_per_iteration": 2.6249122619628906 }, { "auxiliary_loss_clip": 0.01113488, "auxiliary_loss_mlp": 0.0105622, "balance_loss_clip": 1.04640627, "balance_loss_mlp": 1.03931606, "epoch": 0.321599278520968, "flos": 17821820010240.0, "grad_norm": 2.570277900111974, "language_loss": 0.85020632, "learning_rate": 3.172385913647542e-06, "loss": 0.87190342, "num_input_tokens_seen": 114900745, "step": 5349, "time_per_iteration": 2.6685211658477783 }, { "auxiliary_loss_clip": 0.01113139, "auxiliary_loss_mlp": 0.0104332, "balance_loss_clip": 1.04840457, "balance_loss_mlp": 1.02644002, "epoch": 0.32165940177363594, "flos": 16251769555200.0, "grad_norm": 2.7209437086115282, "language_loss": 0.80619532, "learning_rate": 3.172070360676475e-06, "loss": 0.82775992, "num_input_tokens_seen": 114917940, "step": 5350, "time_per_iteration": 2.6857874393463135 }, { "auxiliary_loss_clip": 0.01128309, "auxiliary_loss_mlp": 0.01045442, "balance_loss_clip": 1.05025196, "balance_loss_mlp": 1.02955103, "epoch": 0.3217195250263039, "flos": 27599900158080.0, "grad_norm": 5.5112684101117395, "language_loss": 0.80060112, "learning_rate": 3.1717547632603828e-06, "loss": 0.82233858, "num_input_tokens_seen": 114937735, "step": 5351, "time_per_iteration": 2.68406081199646 }, { "auxiliary_loss_clip": 0.01104774, "auxiliary_loss_mlp": 0.01045518, "balance_loss_clip": 1.04905438, "balance_loss_mlp": 1.02811348, "epoch": 0.3217796482789719, "flos": 21470595373440.0, "grad_norm": 2.189681121413186, "language_loss": 0.75826663, "learning_rate": 3.1714391214112326e-06, "loss": 0.7797696, "num_input_tokens_seen": 114956630, "step": 5352, "time_per_iteration": 2.7035396099090576 }, { "auxiliary_loss_clip": 0.0109763, "auxiliary_loss_mlp": 0.01043305, "balance_loss_clip": 1.04897571, "balance_loss_mlp": 1.02579308, "epoch": 0.32183977153163984, "flos": 21215593745280.0, "grad_norm": 2.4508783518814807, "language_loss": 0.81992233, "learning_rate": 3.1711234351409933e-06, "loss": 0.84133166, "num_input_tokens_seen": 114976470, "step": 5353, "time_per_iteration": 2.731339931488037 }, { "auxiliary_loss_clip": 0.01074627, "auxiliary_loss_mlp": 0.0104331, "balance_loss_clip": 1.04917347, "balance_loss_mlp": 1.02605999, "epoch": 0.3218998947843078, "flos": 24608182331520.0, "grad_norm": 2.2390857397461246, "language_loss": 0.73474252, "learning_rate": 3.1708077044616365e-06, "loss": 0.75592184, "num_input_tokens_seen": 114996710, "step": 5354, "time_per_iteration": 2.8337595462799072 }, { "auxiliary_loss_clip": 0.01103547, "auxiliary_loss_mlp": 0.01039731, "balance_loss_clip": 1.04475546, "balance_loss_mlp": 1.02428102, "epoch": 0.3219600180369758, "flos": 22270577126400.0, "grad_norm": 1.8690515367544651, "language_loss": 0.83792925, "learning_rate": 3.1704919293851334e-06, "loss": 0.85936201, "num_input_tokens_seen": 115015775, "step": 5355, "time_per_iteration": 2.7299652099609375 }, { "auxiliary_loss_clip": 0.01146025, "auxiliary_loss_mlp": 0.01046795, "balance_loss_clip": 1.05450225, "balance_loss_mlp": 1.03032064, "epoch": 0.3220201412896438, "flos": 14939126939520.0, "grad_norm": 1.9705527058452093, "language_loss": 0.70895493, "learning_rate": 3.1701761099234597e-06, "loss": 0.73088312, "num_input_tokens_seen": 115034265, "step": 5356, "time_per_iteration": 2.638268232345581 }, { "auxiliary_loss_clip": 0.01102103, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.04954576, "balance_loss_mlp": 1.02245283, "epoch": 0.32208026454231176, "flos": 22667389649280.0, "grad_norm": 2.5241040535813095, "language_loss": 0.67760962, "learning_rate": 3.1698602460885903e-06, "loss": 0.69903815, "num_input_tokens_seen": 115051945, "step": 5357, "time_per_iteration": 2.7816576957702637 }, { "auxiliary_loss_clip": 0.01037625, "auxiliary_loss_mlp": 0.01029071, "balance_loss_clip": 1.0279882, "balance_loss_mlp": 1.02722347, "epoch": 0.3221403877949797, "flos": 64605130053120.0, "grad_norm": 0.7244200234208643, "language_loss": 0.58319688, "learning_rate": 3.1695443378925035e-06, "loss": 0.60386384, "num_input_tokens_seen": 115119090, "step": 5358, "time_per_iteration": 3.3341448307037354 }, { "auxiliary_loss_clip": 0.01076802, "auxiliary_loss_mlp": 0.01044493, "balance_loss_clip": 1.04142976, "balance_loss_mlp": 1.0270052, "epoch": 0.3222005110476477, "flos": 20157019004160.0, "grad_norm": 2.2322811787478427, "language_loss": 0.83184302, "learning_rate": 3.1692283853471777e-06, "loss": 0.85305595, "num_input_tokens_seen": 115137755, "step": 5359, "time_per_iteration": 2.836543083190918 }, { "auxiliary_loss_clip": 0.01129966, "auxiliary_loss_mlp": 0.01035598, "balance_loss_clip": 1.04800034, "balance_loss_mlp": 1.01938617, "epoch": 0.32226063430031565, "flos": 22674177319680.0, "grad_norm": 2.0261007556732964, "language_loss": 0.79563689, "learning_rate": 3.168912388464595e-06, "loss": 0.81729257, "num_input_tokens_seen": 115158150, "step": 5360, "time_per_iteration": 2.66043758392334 }, { "auxiliary_loss_clip": 0.01045199, "auxiliary_loss_mlp": 0.01009155, "balance_loss_clip": 1.02352595, "balance_loss_mlp": 1.00706911, "epoch": 0.3223207575529836, "flos": 63828525075840.0, "grad_norm": 0.6569282603798298, "language_loss": 0.56928504, "learning_rate": 3.168596347256737e-06, "loss": 0.58982855, "num_input_tokens_seen": 115212755, "step": 5361, "time_per_iteration": 3.007119655609131 }, { "auxiliary_loss_clip": 0.01078785, "auxiliary_loss_mlp": 0.01049092, "balance_loss_clip": 1.04366553, "balance_loss_mlp": 1.03166366, "epoch": 0.3223808808056516, "flos": 26870123537280.0, "grad_norm": 3.2787914187636495, "language_loss": 0.71563178, "learning_rate": 3.168280261735588e-06, "loss": 0.73691058, "num_input_tokens_seen": 115233090, "step": 5362, "time_per_iteration": 2.8345048427581787 }, { "auxiliary_loss_clip": 0.0112485, "auxiliary_loss_mlp": 0.01053523, "balance_loss_clip": 1.04899716, "balance_loss_mlp": 1.03670287, "epoch": 0.32244100405831955, "flos": 26761350176640.0, "grad_norm": 2.1292104037374773, "language_loss": 0.74106693, "learning_rate": 3.167964131913135e-06, "loss": 0.76285076, "num_input_tokens_seen": 115252645, "step": 5363, "time_per_iteration": 2.70552659034729 }, { "auxiliary_loss_clip": 0.01134941, "auxiliary_loss_mlp": 0.01042612, "balance_loss_clip": 1.05024791, "balance_loss_mlp": 1.02637601, "epoch": 0.3225011273109875, "flos": 23803029020160.0, "grad_norm": 3.812297759050374, "language_loss": 0.77379405, "learning_rate": 3.167647957801365e-06, "loss": 0.7955696, "num_input_tokens_seen": 115269085, "step": 5364, "time_per_iteration": 2.66058087348938 }, { "auxiliary_loss_clip": 0.01120766, "auxiliary_loss_mlp": 0.01042612, "balance_loss_clip": 1.05058861, "balance_loss_mlp": 1.02468252, "epoch": 0.3225612505636555, "flos": 17274505501440.0, "grad_norm": 3.514939630870356, "language_loss": 0.76727009, "learning_rate": 3.1673317394122672e-06, "loss": 0.78890389, "num_input_tokens_seen": 115286470, "step": 5365, "time_per_iteration": 2.6493194103240967 }, { "auxiliary_loss_clip": 0.01124156, "auxiliary_loss_mlp": 0.01048476, "balance_loss_clip": 1.05429566, "balance_loss_mlp": 1.03201342, "epoch": 0.32262137381632344, "flos": 23366247638400.0, "grad_norm": 7.419360933702927, "language_loss": 0.76938248, "learning_rate": 3.1670154767578333e-06, "loss": 0.79110885, "num_input_tokens_seen": 115307000, "step": 5366, "time_per_iteration": 2.6984689235687256 }, { "auxiliary_loss_clip": 0.01110868, "auxiliary_loss_mlp": 0.01044399, "balance_loss_clip": 1.04554594, "balance_loss_mlp": 1.02792382, "epoch": 0.3226814970689914, "flos": 23258803080960.0, "grad_norm": 2.2843777844497453, "language_loss": 0.71972823, "learning_rate": 3.166699169850055e-06, "loss": 0.74128091, "num_input_tokens_seen": 115325925, "step": 5367, "time_per_iteration": 2.6944496631622314 }, { "auxiliary_loss_clip": 0.01138096, "auxiliary_loss_mlp": 0.01043716, "balance_loss_clip": 1.05035067, "balance_loss_mlp": 1.0286001, "epoch": 0.32274162032165943, "flos": 16395196561920.0, "grad_norm": 13.04054524246424, "language_loss": 0.74414504, "learning_rate": 3.1663828187009274e-06, "loss": 0.76596308, "num_input_tokens_seen": 115343705, "step": 5368, "time_per_iteration": 2.670567750930786 }, { "auxiliary_loss_clip": 0.01103298, "auxiliary_loss_mlp": 0.01049074, "balance_loss_clip": 1.04370904, "balance_loss_mlp": 1.0322659, "epoch": 0.3228017435743274, "flos": 27855081354240.0, "grad_norm": 1.655769512058306, "language_loss": 0.78693509, "learning_rate": 3.1660664233224467e-06, "loss": 0.80845881, "num_input_tokens_seen": 115364170, "step": 5369, "time_per_iteration": 2.777437448501587 }, { "auxiliary_loss_clip": 0.01099309, "auxiliary_loss_mlp": 0.01037821, "balance_loss_clip": 1.04874706, "balance_loss_mlp": 1.0222764, "epoch": 0.32286186682699536, "flos": 19608770741760.0, "grad_norm": 13.189929997499553, "language_loss": 0.83189309, "learning_rate": 3.16574998372661e-06, "loss": 0.85326445, "num_input_tokens_seen": 115382495, "step": 5370, "time_per_iteration": 2.734342336654663 }, { "auxiliary_loss_clip": 0.01141788, "auxiliary_loss_mlp": 0.01044735, "balance_loss_clip": 1.05202413, "balance_loss_mlp": 1.0291779, "epoch": 0.3229219900796633, "flos": 24134017870080.0, "grad_norm": 3.3293058605981614, "language_loss": 0.8288244, "learning_rate": 3.1654334999254177e-06, "loss": 0.85068965, "num_input_tokens_seen": 115399450, "step": 5371, "time_per_iteration": 2.620091676712036 }, { "auxiliary_loss_clip": 0.01133164, "auxiliary_loss_mlp": 0.00776239, "balance_loss_clip": 1.05046356, "balance_loss_mlp": 1.00122416, "epoch": 0.3229821133323313, "flos": 17748705876480.0, "grad_norm": 3.1117013800624993, "language_loss": 0.8852632, "learning_rate": 3.1651169719308695e-06, "loss": 0.90435725, "num_input_tokens_seen": 115417700, "step": 5372, "time_per_iteration": 2.673567056655884 }, { "auxiliary_loss_clip": 0.01140269, "auxiliary_loss_mlp": 0.01049295, "balance_loss_clip": 1.05098414, "balance_loss_mlp": 1.03341591, "epoch": 0.32304223658499925, "flos": 22346025644160.0, "grad_norm": 2.7114986433136727, "language_loss": 0.73388374, "learning_rate": 3.1648003997549694e-06, "loss": 0.75577939, "num_input_tokens_seen": 115435840, "step": 5373, "time_per_iteration": 2.6910293102264404 }, { "auxiliary_loss_clip": 0.0110976, "auxiliary_loss_mlp": 0.01044756, "balance_loss_clip": 1.04653084, "balance_loss_mlp": 1.02873468, "epoch": 0.3231023598376672, "flos": 18478302929280.0, "grad_norm": 2.3161305262959573, "language_loss": 0.81114149, "learning_rate": 3.1644837834097214e-06, "loss": 0.83268672, "num_input_tokens_seen": 115454210, "step": 5374, "time_per_iteration": 2.666707992553711 }, { "auxiliary_loss_clip": 0.01095169, "auxiliary_loss_mlp": 0.01038679, "balance_loss_clip": 1.0438931, "balance_loss_mlp": 1.02254975, "epoch": 0.3231624830903352, "flos": 27636313570560.0, "grad_norm": 2.1309099752285863, "language_loss": 0.87817222, "learning_rate": 3.1641671229071317e-06, "loss": 0.89951062, "num_input_tokens_seen": 115471785, "step": 5375, "time_per_iteration": 4.252593994140625 }, { "auxiliary_loss_clip": 0.01140942, "auxiliary_loss_mlp": 0.01036182, "balance_loss_clip": 1.04865098, "balance_loss_mlp": 1.01960015, "epoch": 0.32322260634300315, "flos": 21726423014400.0, "grad_norm": 2.12002794330764, "language_loss": 0.75837636, "learning_rate": 3.1638504182592076e-06, "loss": 0.78014749, "num_input_tokens_seen": 115491405, "step": 5376, "time_per_iteration": 2.64569091796875 }, { "auxiliary_loss_clip": 0.01100111, "auxiliary_loss_mlp": 0.01037893, "balance_loss_clip": 1.04745007, "balance_loss_mlp": 1.0227654, "epoch": 0.3232827295956711, "flos": 22637656166400.0, "grad_norm": 16.356053535517315, "language_loss": 0.66570163, "learning_rate": 3.1635336694779594e-06, "loss": 0.68708175, "num_input_tokens_seen": 115511555, "step": 5377, "time_per_iteration": 4.228315591812134 }, { "auxiliary_loss_clip": 0.01103406, "auxiliary_loss_mlp": 0.01059488, "balance_loss_clip": 1.04591548, "balance_loss_mlp": 1.04070055, "epoch": 0.3233428528483391, "flos": 26322593546880.0, "grad_norm": 1.5026052482517693, "language_loss": 0.72276354, "learning_rate": 3.1632168765753982e-06, "loss": 0.74439251, "num_input_tokens_seen": 115532860, "step": 5378, "time_per_iteration": 2.7754812240600586 }, { "auxiliary_loss_clip": 0.0112205, "auxiliary_loss_mlp": 0.0103656, "balance_loss_clip": 1.04869092, "balance_loss_mlp": 1.0214678, "epoch": 0.32340297610100704, "flos": 28585217111040.0, "grad_norm": 2.7898138283200344, "language_loss": 0.82221997, "learning_rate": 3.1629000395635357e-06, "loss": 0.84380603, "num_input_tokens_seen": 115553850, "step": 5379, "time_per_iteration": 2.672743320465088 }, { "auxiliary_loss_clip": 0.01130962, "auxiliary_loss_mlp": 0.01035985, "balance_loss_clip": 1.04864693, "balance_loss_mlp": 1.02083325, "epoch": 0.323463099353675, "flos": 30773792787840.0, "grad_norm": 1.5555457678220286, "language_loss": 0.78895414, "learning_rate": 3.162583158454388e-06, "loss": 0.81062359, "num_input_tokens_seen": 115575530, "step": 5380, "time_per_iteration": 4.130786180496216 }, { "auxiliary_loss_clip": 0.01124956, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.04988194, "balance_loss_mlp": 1.0286541, "epoch": 0.32352322260634303, "flos": 25228610974080.0, "grad_norm": 1.7365933554134192, "language_loss": 0.76877856, "learning_rate": 3.1622662332599697e-06, "loss": 0.79046834, "num_input_tokens_seen": 115594885, "step": 5381, "time_per_iteration": 2.6297740936279297 }, { "auxiliary_loss_clip": 0.01122723, "auxiliary_loss_mlp": 0.0103758, "balance_loss_clip": 1.0485673, "balance_loss_mlp": 1.02333474, "epoch": 0.323583345859011, "flos": 23330480670720.0, "grad_norm": 1.9510545380996942, "language_loss": 0.71868116, "learning_rate": 3.1619492639922998e-06, "loss": 0.7402842, "num_input_tokens_seen": 115614080, "step": 5382, "time_per_iteration": 4.239168167114258 }, { "auxiliary_loss_clip": 0.01114051, "auxiliary_loss_mlp": 0.01051511, "balance_loss_clip": 1.0454843, "balance_loss_mlp": 1.03392792, "epoch": 0.32364346911167896, "flos": 26207499392640.0, "grad_norm": 2.5669193665709815, "language_loss": 0.70947385, "learning_rate": 3.1616322506633964e-06, "loss": 0.73112947, "num_input_tokens_seen": 115632820, "step": 5383, "time_per_iteration": 2.701462507247925 }, { "auxiliary_loss_clip": 0.01123558, "auxiliary_loss_mlp": 0.01038956, "balance_loss_clip": 1.04770291, "balance_loss_mlp": 1.02382779, "epoch": 0.3237035923643469, "flos": 23695764030720.0, "grad_norm": 1.9442688765107798, "language_loss": 0.78333974, "learning_rate": 3.161315193285283e-06, "loss": 0.8049649, "num_input_tokens_seen": 115652860, "step": 5384, "time_per_iteration": 2.6939637660980225 }, { "auxiliary_loss_clip": 0.01078749, "auxiliary_loss_mlp": 0.01050129, "balance_loss_clip": 1.04298878, "balance_loss_mlp": 1.03203273, "epoch": 0.3237637156170149, "flos": 14428728633600.0, "grad_norm": 2.1298780259276575, "language_loss": 0.75396919, "learning_rate": 3.16099809186998e-06, "loss": 0.77525795, "num_input_tokens_seen": 115670940, "step": 5385, "time_per_iteration": 2.7813403606414795 }, { "auxiliary_loss_clip": 0.0111287, "auxiliary_loss_mlp": 0.01040739, "balance_loss_clip": 1.04995322, "balance_loss_mlp": 1.0248363, "epoch": 0.32382383886968286, "flos": 31062981185280.0, "grad_norm": 2.042597717530735, "language_loss": 0.71488941, "learning_rate": 3.1606809464295145e-06, "loss": 0.73642552, "num_input_tokens_seen": 115691155, "step": 5386, "time_per_iteration": 2.754636526107788 }, { "auxiliary_loss_clip": 0.01142583, "auxiliary_loss_mlp": 0.01040273, "balance_loss_clip": 1.0499016, "balance_loss_mlp": 1.02334547, "epoch": 0.3238839621223508, "flos": 23256935573760.0, "grad_norm": 5.057227062214219, "language_loss": 0.94889075, "learning_rate": 3.1603637569759095e-06, "loss": 0.97071928, "num_input_tokens_seen": 115710340, "step": 5387, "time_per_iteration": 2.6547048091888428 }, { "auxiliary_loss_clip": 0.01133488, "auxiliary_loss_mlp": 0.01044118, "balance_loss_clip": 1.05193102, "balance_loss_mlp": 1.02696419, "epoch": 0.3239440853750188, "flos": 22964658606720.0, "grad_norm": 10.717385990424205, "language_loss": 0.77620786, "learning_rate": 3.1600465235211956e-06, "loss": 0.79798394, "num_input_tokens_seen": 115726745, "step": 5388, "time_per_iteration": 2.657205820083618 }, { "auxiliary_loss_clip": 0.01111832, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.04523969, "balance_loss_mlp": 1.01978493, "epoch": 0.32400420862768675, "flos": 36246614653440.0, "grad_norm": 2.237731185409586, "language_loss": 0.71233571, "learning_rate": 3.1597292460774006e-06, "loss": 0.73382103, "num_input_tokens_seen": 115749385, "step": 5389, "time_per_iteration": 2.799731731414795 }, { "auxiliary_loss_clip": 0.01099836, "auxiliary_loss_mlp": 0.01038996, "balance_loss_clip": 1.04759645, "balance_loss_mlp": 1.02302158, "epoch": 0.3240643318803547, "flos": 21616500418560.0, "grad_norm": 1.8547230503773184, "language_loss": 0.80461568, "learning_rate": 3.159411924656557e-06, "loss": 0.82600403, "num_input_tokens_seen": 115768105, "step": 5390, "time_per_iteration": 2.703913450241089 }, { "auxiliary_loss_clip": 0.01112322, "auxiliary_loss_mlp": 0.01050073, "balance_loss_clip": 1.04881656, "balance_loss_mlp": 1.0330621, "epoch": 0.3241244551330227, "flos": 23295611543040.0, "grad_norm": 4.514534114801655, "language_loss": 0.72674775, "learning_rate": 3.1590945592706967e-06, "loss": 0.74837172, "num_input_tokens_seen": 115787340, "step": 5391, "time_per_iteration": 2.8789660930633545 }, { "auxiliary_loss_clip": 0.01110171, "auxiliary_loss_mlp": 0.01040459, "balance_loss_clip": 1.04422975, "balance_loss_mlp": 1.02517664, "epoch": 0.32418457838569065, "flos": 14097236993280.0, "grad_norm": 2.092129040046021, "language_loss": 0.77347648, "learning_rate": 3.158777149931855e-06, "loss": 0.79498285, "num_input_tokens_seen": 115805565, "step": 5392, "time_per_iteration": 2.6689188480377197 }, { "auxiliary_loss_clip": 0.01112252, "auxiliary_loss_mlp": 0.01051929, "balance_loss_clip": 1.04517519, "balance_loss_mlp": 1.03289127, "epoch": 0.3242447016383586, "flos": 29752672953600.0, "grad_norm": 1.9207699243041063, "language_loss": 0.62606925, "learning_rate": 3.158459696652067e-06, "loss": 0.6477111, "num_input_tokens_seen": 115826725, "step": 5393, "time_per_iteration": 2.758423328399658 }, { "auxiliary_loss_clip": 0.01122257, "auxiliary_loss_mlp": 0.01043934, "balance_loss_clip": 1.04730856, "balance_loss_mlp": 1.02770925, "epoch": 0.3243048248910266, "flos": 24351205455360.0, "grad_norm": 1.583732116281239, "language_loss": 0.82284617, "learning_rate": 3.158142199443371e-06, "loss": 0.84450811, "num_input_tokens_seen": 115846955, "step": 5394, "time_per_iteration": 2.6715636253356934 }, { "auxiliary_loss_clip": 0.01111969, "auxiliary_loss_mlp": 0.01045824, "balance_loss_clip": 1.04729748, "balance_loss_mlp": 1.03120947, "epoch": 0.3243649481436946, "flos": 24353037048960.0, "grad_norm": 1.873068954405441, "language_loss": 0.817029, "learning_rate": 3.1578246583178076e-06, "loss": 0.83860689, "num_input_tokens_seen": 115865975, "step": 5395, "time_per_iteration": 2.7120518684387207 }, { "auxiliary_loss_clip": 0.01126983, "auxiliary_loss_mlp": 0.01039478, "balance_loss_clip": 1.0519104, "balance_loss_mlp": 1.02413607, "epoch": 0.32442507139636256, "flos": 22925228451840.0, "grad_norm": 1.8441183317386671, "language_loss": 0.83172363, "learning_rate": 3.157507073287417e-06, "loss": 0.85338825, "num_input_tokens_seen": 115884950, "step": 5396, "time_per_iteration": 2.6589252948760986 }, { "auxiliary_loss_clip": 0.0110371, "auxiliary_loss_mlp": 0.01053141, "balance_loss_clip": 1.04818082, "balance_loss_mlp": 1.03462827, "epoch": 0.32448519464903053, "flos": 22200192426240.0, "grad_norm": 2.3735724483298553, "language_loss": 0.75721765, "learning_rate": 3.1571894443642414e-06, "loss": 0.77878618, "num_input_tokens_seen": 115904170, "step": 5397, "time_per_iteration": 2.7118513584136963 }, { "auxiliary_loss_clip": 0.01104001, "auxiliary_loss_mlp": 0.0104059, "balance_loss_clip": 1.04970932, "balance_loss_mlp": 1.02504468, "epoch": 0.3245453179016985, "flos": 18838450644480.0, "grad_norm": 7.349892433890134, "language_loss": 0.67359912, "learning_rate": 3.1568717715603263e-06, "loss": 0.69504505, "num_input_tokens_seen": 115919255, "step": 5398, "time_per_iteration": 2.690317153930664 }, { "auxiliary_loss_clip": 0.01111486, "auxiliary_loss_mlp": 0.01033579, "balance_loss_clip": 1.04846239, "balance_loss_mlp": 1.01784301, "epoch": 0.32460544115436646, "flos": 21178390233600.0, "grad_norm": 1.692830304346276, "language_loss": 0.73074687, "learning_rate": 3.156554054887718e-06, "loss": 0.7521975, "num_input_tokens_seen": 115938535, "step": 5399, "time_per_iteration": 2.754539728164673 }, { "auxiliary_loss_clip": 0.01101582, "auxiliary_loss_mlp": 0.01036858, "balance_loss_clip": 1.04522848, "balance_loss_mlp": 1.02056217, "epoch": 0.3246655644070344, "flos": 21981137333760.0, "grad_norm": 2.780796864612311, "language_loss": 0.71580744, "learning_rate": 3.1562362943584645e-06, "loss": 0.7371918, "num_input_tokens_seen": 115955005, "step": 5400, "time_per_iteration": 2.707712173461914 }, { "auxiliary_loss_clip": 0.01127225, "auxiliary_loss_mlp": 0.01040347, "balance_loss_clip": 1.0472424, "balance_loss_mlp": 1.02469516, "epoch": 0.3247256876597024, "flos": 32159729105280.0, "grad_norm": 2.1905750946262805, "language_loss": 0.79769576, "learning_rate": 3.155918489984614e-06, "loss": 0.81937146, "num_input_tokens_seen": 115975305, "step": 5401, "time_per_iteration": 2.7813303470611572 }, { "auxiliary_loss_clip": 0.01109499, "auxiliary_loss_mlp": 0.01041329, "balance_loss_clip": 1.04414558, "balance_loss_mlp": 1.02341187, "epoch": 0.32478581091237035, "flos": 20997544233600.0, "grad_norm": 4.743153882711402, "language_loss": 0.87785316, "learning_rate": 3.1556006417782196e-06, "loss": 0.89936143, "num_input_tokens_seen": 115994810, "step": 5402, "time_per_iteration": 2.7685606479644775 }, { "auxiliary_loss_clip": 0.01078796, "auxiliary_loss_mlp": 0.01044786, "balance_loss_clip": 1.03948891, "balance_loss_mlp": 1.02792931, "epoch": 0.3248459341650383, "flos": 17924990849280.0, "grad_norm": 4.964706141121962, "language_loss": 0.84572911, "learning_rate": 3.155282749751332e-06, "loss": 0.86696494, "num_input_tokens_seen": 116011095, "step": 5403, "time_per_iteration": 2.7299063205718994 }, { "auxiliary_loss_clip": 0.01104053, "auxiliary_loss_mlp": 0.01045074, "balance_loss_clip": 1.04597795, "balance_loss_mlp": 1.03049469, "epoch": 0.3249060574177063, "flos": 24535606901760.0, "grad_norm": 3.7265891750540785, "language_loss": 0.87614954, "learning_rate": 3.154964813916007e-06, "loss": 0.89764082, "num_input_tokens_seen": 116028805, "step": 5404, "time_per_iteration": 2.7740931510925293 }, { "auxiliary_loss_clip": 0.01125798, "auxiliary_loss_mlp": 0.01043439, "balance_loss_clip": 1.04930234, "balance_loss_mlp": 1.02685738, "epoch": 0.32496618067037425, "flos": 25994765093760.0, "grad_norm": 2.5497237434599964, "language_loss": 0.72717422, "learning_rate": 3.1546468342843008e-06, "loss": 0.74886656, "num_input_tokens_seen": 116047765, "step": 5405, "time_per_iteration": 2.6756839752197266 }, { "auxiliary_loss_clip": 0.01098309, "auxiliary_loss_mlp": 0.01039466, "balance_loss_clip": 1.04964566, "balance_loss_mlp": 1.02390265, "epoch": 0.3250263039230422, "flos": 19573757959680.0, "grad_norm": 1.6968031771183532, "language_loss": 0.82927752, "learning_rate": 3.1543288108682707e-06, "loss": 0.8506552, "num_input_tokens_seen": 116068385, "step": 5406, "time_per_iteration": 2.728217124938965 }, { "auxiliary_loss_clip": 0.01136878, "auxiliary_loss_mlp": 0.01032192, "balance_loss_clip": 1.05117011, "balance_loss_mlp": 1.01728487, "epoch": 0.3250864271757102, "flos": 16763640318720.0, "grad_norm": 1.9312900503750694, "language_loss": 0.87836796, "learning_rate": 3.1540107436799764e-06, "loss": 0.90005869, "num_input_tokens_seen": 116085350, "step": 5407, "time_per_iteration": 2.5519261360168457 }, { "auxiliary_loss_clip": 0.01112002, "auxiliary_loss_mlp": 0.01040482, "balance_loss_clip": 1.04575169, "balance_loss_mlp": 1.02506793, "epoch": 0.3251465504283782, "flos": 27819458040960.0, "grad_norm": 1.6044550363094983, "language_loss": 0.69804603, "learning_rate": 3.153692632731479e-06, "loss": 0.71957088, "num_input_tokens_seen": 116107560, "step": 5408, "time_per_iteration": 2.7141807079315186 }, { "auxiliary_loss_clip": 0.01131975, "auxiliary_loss_mlp": 0.01035871, "balance_loss_clip": 1.05021083, "balance_loss_mlp": 1.01977742, "epoch": 0.32520667368104617, "flos": 19063144172160.0, "grad_norm": 10.423580562540607, "language_loss": 0.77558911, "learning_rate": 3.153374478034841e-06, "loss": 0.79726762, "num_input_tokens_seen": 116125980, "step": 5409, "time_per_iteration": 2.644792318344116 }, { "auxiliary_loss_clip": 0.01079567, "auxiliary_loss_mlp": 0.01043858, "balance_loss_clip": 1.03893065, "balance_loss_mlp": 1.0280745, "epoch": 0.32526679693371413, "flos": 29382146208000.0, "grad_norm": 2.0524453166640146, "language_loss": 0.83282518, "learning_rate": 3.1530562796021285e-06, "loss": 0.85405946, "num_input_tokens_seen": 116146530, "step": 5410, "time_per_iteration": 2.846480131149292 }, { "auxiliary_loss_clip": 0.01086095, "auxiliary_loss_mlp": 0.01037636, "balance_loss_clip": 1.04789686, "balance_loss_mlp": 1.02272296, "epoch": 0.3253269201863821, "flos": 20704513080960.0, "grad_norm": 1.6475099523255856, "language_loss": 0.7081182, "learning_rate": 3.152738037445405e-06, "loss": 0.72935545, "num_input_tokens_seen": 116165695, "step": 5411, "time_per_iteration": 2.779330253601074 }, { "auxiliary_loss_clip": 0.0108148, "auxiliary_loss_mlp": 0.01041588, "balance_loss_clip": 1.04331398, "balance_loss_mlp": 1.02688956, "epoch": 0.32538704343905006, "flos": 29094142959360.0, "grad_norm": 1.6354124554173295, "language_loss": 0.82894456, "learning_rate": 3.1524197515767403e-06, "loss": 0.85017526, "num_input_tokens_seen": 116185375, "step": 5412, "time_per_iteration": 2.7841992378234863 }, { "auxiliary_loss_clip": 0.01106895, "auxiliary_loss_mlp": 0.01041599, "balance_loss_clip": 1.04730868, "balance_loss_mlp": 1.02430189, "epoch": 0.325447166691718, "flos": 24676124906880.0, "grad_norm": 1.867437266565155, "language_loss": 0.80913842, "learning_rate": 3.152101422008203e-06, "loss": 0.83062339, "num_input_tokens_seen": 116204335, "step": 5413, "time_per_iteration": 2.7533957958221436 }, { "auxiliary_loss_clip": 0.01115005, "auxiliary_loss_mlp": 0.0103855, "balance_loss_clip": 1.04923081, "balance_loss_mlp": 1.02155089, "epoch": 0.325507289944386, "flos": 21543134889600.0, "grad_norm": 3.355430774898342, "language_loss": 0.76891947, "learning_rate": 3.151783048751864e-06, "loss": 0.79045498, "num_input_tokens_seen": 116222840, "step": 5414, "time_per_iteration": 4.331217527389526 }, { "auxiliary_loss_clip": 0.01030644, "auxiliary_loss_mlp": 0.01012699, "balance_loss_clip": 1.02726388, "balance_loss_mlp": 1.01063681, "epoch": 0.32556741319705396, "flos": 71518722347520.0, "grad_norm": 0.9066964616955783, "language_loss": 0.63865513, "learning_rate": 3.1514646318197965e-06, "loss": 0.65908855, "num_input_tokens_seen": 116274940, "step": 5415, "time_per_iteration": 3.172816753387451 }, { "auxiliary_loss_clip": 0.01088465, "auxiliary_loss_mlp": 0.01038606, "balance_loss_clip": 1.04119301, "balance_loss_mlp": 1.02279866, "epoch": 0.3256275364497219, "flos": 23732428838400.0, "grad_norm": 1.52454367487569, "language_loss": 0.74014068, "learning_rate": 3.151146171224075e-06, "loss": 0.76141143, "num_input_tokens_seen": 116297300, "step": 5416, "time_per_iteration": 4.326166868209839 }, { "auxiliary_loss_clip": 0.01062287, "auxiliary_loss_mlp": 0.0100407, "balance_loss_clip": 1.03045964, "balance_loss_mlp": 1.00160217, "epoch": 0.3256876597023899, "flos": 67289199891840.0, "grad_norm": 0.7686966052914506, "language_loss": 0.57851374, "learning_rate": 3.1508276669767757e-06, "loss": 0.59917736, "num_input_tokens_seen": 116362370, "step": 5417, "time_per_iteration": 3.2102463245391846 }, { "auxiliary_loss_clip": 0.01040835, "auxiliary_loss_mlp": 0.01012103, "balance_loss_clip": 1.02768993, "balance_loss_mlp": 1.00975466, "epoch": 0.32574778295505785, "flos": 71282323964160.0, "grad_norm": 0.7997987203444133, "language_loss": 0.63392216, "learning_rate": 3.150509119089975e-06, "loss": 0.65445155, "num_input_tokens_seen": 116430365, "step": 5418, "time_per_iteration": 4.847350120544434 }, { "auxiliary_loss_clip": 0.01110249, "auxiliary_loss_mlp": 0.01043458, "balance_loss_clip": 1.05171919, "balance_loss_mlp": 1.02794838, "epoch": 0.3258079062077258, "flos": 20776370238720.0, "grad_norm": 2.0985111563442325, "language_loss": 0.69086784, "learning_rate": 3.1501905275757537e-06, "loss": 0.71240497, "num_input_tokens_seen": 116447525, "step": 5419, "time_per_iteration": 2.6837174892425537 }, { "auxiliary_loss_clip": 0.0112744, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.05152702, "balance_loss_mlp": 1.02099252, "epoch": 0.3258680294603938, "flos": 22235456603520.0, "grad_norm": 1.6553118170887535, "language_loss": 0.77041519, "learning_rate": 3.1498718924461926e-06, "loss": 0.79206121, "num_input_tokens_seen": 116466310, "step": 5420, "time_per_iteration": 2.690243721008301 }, { "auxiliary_loss_clip": 0.01124221, "auxiliary_loss_mlp": 0.00774579, "balance_loss_clip": 1.04583097, "balance_loss_mlp": 1.00118852, "epoch": 0.3259281527130618, "flos": 26979974305920.0, "grad_norm": 1.6758047570714483, "language_loss": 0.8033973, "learning_rate": 3.1495532137133736e-06, "loss": 0.82238531, "num_input_tokens_seen": 116487825, "step": 5421, "time_per_iteration": 4.346652984619141 }, { "auxiliary_loss_clip": 0.01133401, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.04982162, "balance_loss_mlp": 1.0212909, "epoch": 0.32598827596572977, "flos": 26214251149440.0, "grad_norm": 1.7368751669124027, "language_loss": 0.75101721, "learning_rate": 3.149234491389381e-06, "loss": 0.77270067, "num_input_tokens_seen": 116509950, "step": 5422, "time_per_iteration": 2.698486566543579 }, { "auxiliary_loss_clip": 0.01104722, "auxiliary_loss_mlp": 0.00773675, "balance_loss_clip": 1.04894829, "balance_loss_mlp": 1.00120938, "epoch": 0.32604839921839773, "flos": 17639752947840.0, "grad_norm": 2.1580318636917384, "language_loss": 0.63323581, "learning_rate": 3.1489157254863026e-06, "loss": 0.65201974, "num_input_tokens_seen": 116527695, "step": 5423, "time_per_iteration": 2.7364964485168457 }, { "auxiliary_loss_clip": 0.01098661, "auxiliary_loss_mlp": 0.01032454, "balance_loss_clip": 1.04357564, "balance_loss_mlp": 1.01884615, "epoch": 0.3261085224710657, "flos": 23622721724160.0, "grad_norm": 1.5676988826806029, "language_loss": 0.74530792, "learning_rate": 3.148596916016224e-06, "loss": 0.76661909, "num_input_tokens_seen": 116547800, "step": 5424, "time_per_iteration": 2.695530652999878 }, { "auxiliary_loss_clip": 0.0110482, "auxiliary_loss_mlp": 0.01035713, "balance_loss_clip": 1.04803681, "balance_loss_mlp": 1.02199221, "epoch": 0.32616864572373366, "flos": 23260455106560.0, "grad_norm": 1.6667522289255576, "language_loss": 0.77194774, "learning_rate": 3.1482780629912355e-06, "loss": 0.79335308, "num_input_tokens_seen": 116568460, "step": 5425, "time_per_iteration": 2.6649699211120605 }, { "auxiliary_loss_clip": 0.01106187, "auxiliary_loss_mlp": 0.01040306, "balance_loss_clip": 1.04740202, "balance_loss_mlp": 1.02368808, "epoch": 0.32622876897640163, "flos": 25593427457280.0, "grad_norm": 2.8883064562409744, "language_loss": 0.78262472, "learning_rate": 3.147959166423428e-06, "loss": 0.80408967, "num_input_tokens_seen": 116588705, "step": 5426, "time_per_iteration": 2.7820892333984375 }, { "auxiliary_loss_clip": 0.01088898, "auxiliary_loss_mlp": 0.01035243, "balance_loss_clip": 1.04331303, "balance_loss_mlp": 1.01889908, "epoch": 0.3262888922290696, "flos": 22418996123520.0, "grad_norm": 1.9267107865215556, "language_loss": 0.74485052, "learning_rate": 3.147640226324893e-06, "loss": 0.76609194, "num_input_tokens_seen": 116608845, "step": 5427, "time_per_iteration": 2.7831003665924072 }, { "auxiliary_loss_clip": 0.01103791, "auxiliary_loss_mlp": 0.01041786, "balance_loss_clip": 1.04539597, "balance_loss_mlp": 1.02549028, "epoch": 0.32634901548173756, "flos": 19718908819200.0, "grad_norm": 6.869638277775165, "language_loss": 0.79136658, "learning_rate": 3.1473212427077266e-06, "loss": 0.81282234, "num_input_tokens_seen": 116628145, "step": 5428, "time_per_iteration": 2.7186481952667236 }, { "auxiliary_loss_clip": 0.01121911, "auxiliary_loss_mlp": 0.01040908, "balance_loss_clip": 1.04629314, "balance_loss_mlp": 1.02576876, "epoch": 0.3264091387344055, "flos": 16142924367360.0, "grad_norm": 5.016107817785842, "language_loss": 0.71130025, "learning_rate": 3.147002215584023e-06, "loss": 0.7329284, "num_input_tokens_seen": 116646920, "step": 5429, "time_per_iteration": 2.6733968257904053 }, { "auxiliary_loss_clip": 0.01098408, "auxiliary_loss_mlp": 0.01035827, "balance_loss_clip": 1.04658663, "balance_loss_mlp": 1.0212121, "epoch": 0.3264692619870735, "flos": 16399075230720.0, "grad_norm": 1.7379615094125744, "language_loss": 0.78620625, "learning_rate": 3.146683144965881e-06, "loss": 0.80754858, "num_input_tokens_seen": 116665100, "step": 5430, "time_per_iteration": 2.7313849925994873 }, { "auxiliary_loss_clip": 0.01084979, "auxiliary_loss_mlp": 0.01043143, "balance_loss_clip": 1.04809749, "balance_loss_mlp": 1.02660871, "epoch": 0.32652938523974145, "flos": 22382331315840.0, "grad_norm": 3.4420441965814477, "language_loss": 0.84279943, "learning_rate": 3.146364030865399e-06, "loss": 0.86408061, "num_input_tokens_seen": 116682205, "step": 5431, "time_per_iteration": 2.720797300338745 }, { "auxiliary_loss_clip": 0.01117845, "auxiliary_loss_mlp": 0.01034908, "balance_loss_clip": 1.04730058, "balance_loss_mlp": 1.02067482, "epoch": 0.3265895084924094, "flos": 21908059113600.0, "grad_norm": 1.9482899767939774, "language_loss": 0.70736587, "learning_rate": 3.146044873294678e-06, "loss": 0.7288934, "num_input_tokens_seen": 116702575, "step": 5432, "time_per_iteration": 2.6805124282836914 }, { "auxiliary_loss_clip": 0.01073417, "auxiliary_loss_mlp": 0.01042634, "balance_loss_clip": 1.04051948, "balance_loss_mlp": 1.02625418, "epoch": 0.3266496317450774, "flos": 16067152627200.0, "grad_norm": 1.6263283854003907, "language_loss": 0.84160507, "learning_rate": 3.1457256722658203e-06, "loss": 0.86276555, "num_input_tokens_seen": 116720885, "step": 5433, "time_per_iteration": 2.733450174331665 }, { "auxiliary_loss_clip": 0.01110224, "auxiliary_loss_mlp": 0.01031776, "balance_loss_clip": 1.04831946, "balance_loss_mlp": 1.01733375, "epoch": 0.3267097549977454, "flos": 22528236360960.0, "grad_norm": 1.8752055231309104, "language_loss": 0.860237, "learning_rate": 3.145406427790931e-06, "loss": 0.881657, "num_input_tokens_seen": 116740395, "step": 5434, "time_per_iteration": 2.6711690425872803 }, { "auxiliary_loss_clip": 0.01115762, "auxiliary_loss_mlp": 0.0104022, "balance_loss_clip": 1.04894018, "balance_loss_mlp": 1.02460361, "epoch": 0.32676987825041337, "flos": 27270419679360.0, "grad_norm": 2.089345873834278, "language_loss": 0.87845808, "learning_rate": 3.1450871398821147e-06, "loss": 0.90001786, "num_input_tokens_seen": 116758870, "step": 5435, "time_per_iteration": 2.7342183589935303 }, { "auxiliary_loss_clip": 0.01137287, "auxiliary_loss_mlp": 0.01037617, "balance_loss_clip": 1.05190301, "balance_loss_mlp": 1.02256095, "epoch": 0.32683000150308134, "flos": 11508257433600.0, "grad_norm": 3.0926239838125595, "language_loss": 0.7645883, "learning_rate": 3.144767808551479e-06, "loss": 0.78633732, "num_input_tokens_seen": 116773440, "step": 5436, "time_per_iteration": 2.648062229156494 }, { "auxiliary_loss_clip": 0.01137346, "auxiliary_loss_mlp": 0.01034933, "balance_loss_clip": 1.0532552, "balance_loss_mlp": 1.02046728, "epoch": 0.3268901247557493, "flos": 25630200005760.0, "grad_norm": 1.7720337367532448, "language_loss": 0.71802473, "learning_rate": 3.144448433811134e-06, "loss": 0.73974752, "num_input_tokens_seen": 116794375, "step": 5437, "time_per_iteration": 2.680525541305542 }, { "auxiliary_loss_clip": 0.01095966, "auxiliary_loss_mlp": 0.0104222, "balance_loss_clip": 1.04542243, "balance_loss_mlp": 1.02445781, "epoch": 0.32695024800841727, "flos": 24860849575680.0, "grad_norm": 1.7134236857074348, "language_loss": 0.63728261, "learning_rate": 3.144129015673189e-06, "loss": 0.65866441, "num_input_tokens_seen": 116815095, "step": 5438, "time_per_iteration": 2.7343454360961914 }, { "auxiliary_loss_clip": 0.01128746, "auxiliary_loss_mlp": 0.01039734, "balance_loss_clip": 1.05383801, "balance_loss_mlp": 1.02468967, "epoch": 0.32701037126108523, "flos": 28839249072000.0, "grad_norm": 3.854723832885701, "language_loss": 0.74629039, "learning_rate": 3.1438095541497576e-06, "loss": 0.76797515, "num_input_tokens_seen": 116836630, "step": 5439, "time_per_iteration": 2.6859002113342285 }, { "auxiliary_loss_clip": 0.0113034, "auxiliary_loss_mlp": 0.0104413, "balance_loss_clip": 1.05407321, "balance_loss_mlp": 1.02773881, "epoch": 0.3270704945137532, "flos": 27965075777280.0, "grad_norm": 3.9922367032947634, "language_loss": 0.74743968, "learning_rate": 3.1434900492529527e-06, "loss": 0.76918435, "num_input_tokens_seen": 116856880, "step": 5440, "time_per_iteration": 2.6785733699798584 }, { "auxiliary_loss_clip": 0.01124529, "auxiliary_loss_mlp": 0.00773254, "balance_loss_clip": 1.05180979, "balance_loss_mlp": 1.00108397, "epoch": 0.32713061776642116, "flos": 23690700213120.0, "grad_norm": 2.2888111794693033, "language_loss": 0.84642965, "learning_rate": 3.1431705009948914e-06, "loss": 0.86540747, "num_input_tokens_seen": 116873770, "step": 5441, "time_per_iteration": 2.692375421524048 }, { "auxiliary_loss_clip": 0.01126517, "auxiliary_loss_mlp": 0.01042941, "balance_loss_clip": 1.05065203, "balance_loss_mlp": 1.02715778, "epoch": 0.3271907410190891, "flos": 22455625017600.0, "grad_norm": 3.048730330719705, "language_loss": 0.86782062, "learning_rate": 3.1428509093876897e-06, "loss": 0.88951516, "num_input_tokens_seen": 116891225, "step": 5442, "time_per_iteration": 2.6678872108459473 }, { "auxiliary_loss_clip": 0.01105154, "auxiliary_loss_mlp": 0.01041235, "balance_loss_clip": 1.05088091, "balance_loss_mlp": 1.02450991, "epoch": 0.3272508642717571, "flos": 22820118278400.0, "grad_norm": 2.240879974234663, "language_loss": 0.77471602, "learning_rate": 3.1425312744434668e-06, "loss": 0.79617989, "num_input_tokens_seen": 116912300, "step": 5443, "time_per_iteration": 2.715407133102417 }, { "auxiliary_loss_clip": 0.01109692, "auxiliary_loss_mlp": 0.00773391, "balance_loss_clip": 1.05144906, "balance_loss_mlp": 1.00102162, "epoch": 0.32731098752442506, "flos": 11801360413440.0, "grad_norm": 2.595112113661144, "language_loss": 0.81782895, "learning_rate": 3.142211596174343e-06, "loss": 0.83665979, "num_input_tokens_seen": 116929425, "step": 5444, "time_per_iteration": 2.7483620643615723 }, { "auxiliary_loss_clip": 0.0109768, "auxiliary_loss_mlp": 0.01042359, "balance_loss_clip": 1.05127132, "balance_loss_mlp": 1.02671897, "epoch": 0.327371110777093, "flos": 21027780506880.0, "grad_norm": 2.0540771727134786, "language_loss": 0.59668452, "learning_rate": 3.1418918745924423e-06, "loss": 0.61808491, "num_input_tokens_seen": 116948255, "step": 5445, "time_per_iteration": 2.7937049865722656 }, { "auxiliary_loss_clip": 0.01134371, "auxiliary_loss_mlp": 0.01045479, "balance_loss_clip": 1.05779314, "balance_loss_mlp": 1.02935553, "epoch": 0.327431234029761, "flos": 19062102677760.0, "grad_norm": 2.705344105300375, "language_loss": 0.88343978, "learning_rate": 3.1415721097098865e-06, "loss": 0.90523833, "num_input_tokens_seen": 116964905, "step": 5446, "time_per_iteration": 2.586451292037964 }, { "auxiliary_loss_clip": 0.01135097, "auxiliary_loss_mlp": 0.01041409, "balance_loss_clip": 1.0612191, "balance_loss_mlp": 1.02387285, "epoch": 0.32749135728242895, "flos": 25849219184640.0, "grad_norm": 2.2697780368090883, "language_loss": 0.79279661, "learning_rate": 3.141252301538802e-06, "loss": 0.81456167, "num_input_tokens_seen": 116983650, "step": 5447, "time_per_iteration": 2.744072198867798 }, { "auxiliary_loss_clip": 0.01107571, "auxiliary_loss_mlp": 0.00773964, "balance_loss_clip": 1.04747021, "balance_loss_mlp": 1.00110793, "epoch": 0.327551480535097, "flos": 20120533764480.0, "grad_norm": 1.8015667711206929, "language_loss": 0.73182315, "learning_rate": 3.1409324500913157e-06, "loss": 0.75063848, "num_input_tokens_seen": 117003265, "step": 5448, "time_per_iteration": 2.6825077533721924 }, { "auxiliary_loss_clip": 0.01142648, "auxiliary_loss_mlp": 0.01042295, "balance_loss_clip": 1.05620432, "balance_loss_mlp": 1.02694106, "epoch": 0.32761160378776494, "flos": 28803553931520.0, "grad_norm": 1.4660761852129829, "language_loss": 0.67103487, "learning_rate": 3.1406125553795567e-06, "loss": 0.69288433, "num_input_tokens_seen": 117025370, "step": 5449, "time_per_iteration": 2.682499885559082 }, { "auxiliary_loss_clip": 0.0110995, "auxiliary_loss_mlp": 0.010411, "balance_loss_clip": 1.0542469, "balance_loss_mlp": 1.02627623, "epoch": 0.3276717270404329, "flos": 26937778803840.0, "grad_norm": 3.4023702964270943, "language_loss": 0.65110958, "learning_rate": 3.1402926174156556e-06, "loss": 0.67262006, "num_input_tokens_seen": 117044350, "step": 5450, "time_per_iteration": 2.7582857608795166 }, { "auxiliary_loss_clip": 0.0113136, "auxiliary_loss_mlp": 0.01045713, "balance_loss_clip": 1.05517817, "balance_loss_mlp": 1.03021002, "epoch": 0.32773185029310087, "flos": 25338425829120.0, "grad_norm": 1.5880234750249043, "language_loss": 0.77630055, "learning_rate": 3.1399726362117437e-06, "loss": 0.79807132, "num_input_tokens_seen": 117064450, "step": 5451, "time_per_iteration": 2.6543071269989014 }, { "auxiliary_loss_clip": 0.01131184, "auxiliary_loss_mlp": 0.01044056, "balance_loss_clip": 1.05428064, "balance_loss_mlp": 1.02809358, "epoch": 0.32779197354576883, "flos": 26391721271040.0, "grad_norm": 1.913131066587778, "language_loss": 0.70510584, "learning_rate": 3.1396526117799555e-06, "loss": 0.7268582, "num_input_tokens_seen": 117083060, "step": 5452, "time_per_iteration": 2.6963608264923096 }, { "auxiliary_loss_clip": 0.01112229, "auxiliary_loss_mlp": 0.01036592, "balance_loss_clip": 1.048841, "balance_loss_mlp": 1.02223349, "epoch": 0.3278520967984368, "flos": 24899381890560.0, "grad_norm": 2.6287596248848013, "language_loss": 0.78730083, "learning_rate": 3.1393325441324256e-06, "loss": 0.80878907, "num_input_tokens_seen": 117101860, "step": 5453, "time_per_iteration": 4.197263479232788 }, { "auxiliary_loss_clip": 0.01130585, "auxiliary_loss_mlp": 0.01035536, "balance_loss_clip": 1.0526675, "balance_loss_mlp": 1.02026486, "epoch": 0.32791222005110476, "flos": 29752996176000.0, "grad_norm": 5.184832608635382, "language_loss": 0.75771177, "learning_rate": 3.1390124332812916e-06, "loss": 0.77937293, "num_input_tokens_seen": 117123100, "step": 5454, "time_per_iteration": 2.7643721103668213 }, { "auxiliary_loss_clip": 0.01070253, "auxiliary_loss_mlp": 0.01047697, "balance_loss_clip": 1.03818846, "balance_loss_mlp": 1.03363037, "epoch": 0.32797234330377273, "flos": 16508064072960.0, "grad_norm": 2.8017119157252703, "language_loss": 0.76891404, "learning_rate": 3.1386922792386924e-06, "loss": 0.79009354, "num_input_tokens_seen": 117140515, "step": 5455, "time_per_iteration": 4.402290105819702 }, { "auxiliary_loss_clip": 0.01131084, "auxiliary_loss_mlp": 0.01042542, "balance_loss_clip": 1.05241477, "balance_loss_mlp": 1.02624655, "epoch": 0.3280324665564407, "flos": 26577918397440.0, "grad_norm": 1.6426536912861747, "language_loss": 0.74021912, "learning_rate": 3.138372082016768e-06, "loss": 0.76195538, "num_input_tokens_seen": 117161485, "step": 5456, "time_per_iteration": 2.821965217590332 }, { "auxiliary_loss_clip": 0.01140062, "auxiliary_loss_mlp": 0.01047408, "balance_loss_clip": 1.05334985, "balance_loss_mlp": 1.03212523, "epoch": 0.32809258980910866, "flos": 22929969047040.0, "grad_norm": 1.7597936582740754, "language_loss": 0.78038168, "learning_rate": 3.1380518416276596e-06, "loss": 0.80225635, "num_input_tokens_seen": 117181870, "step": 5457, "time_per_iteration": 2.703756093978882 }, { "auxiliary_loss_clip": 0.01104649, "auxiliary_loss_mlp": 0.01042509, "balance_loss_clip": 1.04943132, "balance_loss_mlp": 1.02752471, "epoch": 0.3281527130617766, "flos": 22783848520320.0, "grad_norm": 5.102364490559591, "language_loss": 0.79493362, "learning_rate": 3.1377315580835115e-06, "loss": 0.81640518, "num_input_tokens_seen": 117201380, "step": 5458, "time_per_iteration": 4.307415962219238 }, { "auxiliary_loss_clip": 0.01124323, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.05467916, "balance_loss_mlp": 1.02362311, "epoch": 0.3282128363144446, "flos": 21250678354560.0, "grad_norm": 1.6160363150508943, "language_loss": 0.73029429, "learning_rate": 3.1374112313964686e-06, "loss": 0.7519297, "num_input_tokens_seen": 117221040, "step": 5459, "time_per_iteration": 2.678131341934204 }, { "auxiliary_loss_clip": 0.01118921, "auxiliary_loss_mlp": 0.01041188, "balance_loss_clip": 1.05190325, "balance_loss_mlp": 1.02591753, "epoch": 0.32827295956711255, "flos": 30843064166400.0, "grad_norm": 2.011905165126453, "language_loss": 0.84018445, "learning_rate": 3.1370908615786783e-06, "loss": 0.86178553, "num_input_tokens_seen": 117241395, "step": 5460, "time_per_iteration": 5.767046213150024 }, { "auxiliary_loss_clip": 0.01138817, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.05174541, "balance_loss_mlp": 1.02029121, "epoch": 0.3283330828197806, "flos": 25915006944000.0, "grad_norm": 1.9959413021835115, "language_loss": 0.76553524, "learning_rate": 3.136770448642288e-06, "loss": 0.78727543, "num_input_tokens_seen": 117259340, "step": 5461, "time_per_iteration": 2.673659086227417 }, { "auxiliary_loss_clip": 0.01121607, "auxiliary_loss_mlp": 0.01042243, "balance_loss_clip": 1.05065536, "balance_loss_mlp": 1.02489805, "epoch": 0.32839320607244854, "flos": 38582065042560.0, "grad_norm": 2.148112131584704, "language_loss": 0.62898672, "learning_rate": 3.1364499925994484e-06, "loss": 0.65062523, "num_input_tokens_seen": 117282375, "step": 5462, "time_per_iteration": 2.789217472076416 }, { "auxiliary_loss_clip": 0.01136727, "auxiliary_loss_mlp": 0.0077334, "balance_loss_clip": 1.05279326, "balance_loss_mlp": 1.00113511, "epoch": 0.3284533293251165, "flos": 26650888876800.0, "grad_norm": 2.4415591889879056, "language_loss": 0.7805075, "learning_rate": 3.1361294934623115e-06, "loss": 0.79960817, "num_input_tokens_seen": 117303830, "step": 5463, "time_per_iteration": 2.6797146797180176 }, { "auxiliary_loss_clip": 0.01109773, "auxiliary_loss_mlp": 0.01040868, "balance_loss_clip": 1.05036163, "balance_loss_mlp": 1.02523983, "epoch": 0.32851345257778447, "flos": 15304158904320.0, "grad_norm": 1.8407799027990368, "language_loss": 0.70095646, "learning_rate": 3.1358089512430303e-06, "loss": 0.72246289, "num_input_tokens_seen": 117320665, "step": 5464, "time_per_iteration": 2.7286477088928223 }, { "auxiliary_loss_clip": 0.01130175, "auxiliary_loss_mlp": 0.01038523, "balance_loss_clip": 1.05659711, "balance_loss_mlp": 1.02327609, "epoch": 0.32857357583045244, "flos": 23513732881920.0, "grad_norm": 1.976060055551124, "language_loss": 0.72474623, "learning_rate": 3.1354883659537594e-06, "loss": 0.74643314, "num_input_tokens_seen": 117339795, "step": 5465, "time_per_iteration": 2.6666364669799805 }, { "auxiliary_loss_clip": 0.01113042, "auxiliary_loss_mlp": 0.01049431, "balance_loss_clip": 1.05094242, "balance_loss_mlp": 1.03334332, "epoch": 0.3286336990831204, "flos": 20995209849600.0, "grad_norm": 1.953344541818443, "language_loss": 0.832214, "learning_rate": 3.1351677376066567e-06, "loss": 0.8538388, "num_input_tokens_seen": 117359525, "step": 5466, "time_per_iteration": 2.7432901859283447 }, { "auxiliary_loss_clip": 0.01113455, "auxiliary_loss_mlp": 0.01041029, "balance_loss_clip": 1.04729056, "balance_loss_mlp": 1.02577055, "epoch": 0.32869382233578837, "flos": 23658811914240.0, "grad_norm": 1.7893036060845653, "language_loss": 0.79221183, "learning_rate": 3.134847066213879e-06, "loss": 0.8137567, "num_input_tokens_seen": 117380320, "step": 5467, "time_per_iteration": 2.701490879058838 }, { "auxiliary_loss_clip": 0.0111678, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.05045676, "balance_loss_mlp": 1.01759124, "epoch": 0.32875394558845633, "flos": 25336522408320.0, "grad_norm": 1.5411251384559923, "language_loss": 0.74338531, "learning_rate": 3.134526351787587e-06, "loss": 0.76488233, "num_input_tokens_seen": 117400695, "step": 5468, "time_per_iteration": 2.6820507049560547 }, { "auxiliary_loss_clip": 0.0111552, "auxiliary_loss_mlp": 0.01042549, "balance_loss_clip": 1.05065966, "balance_loss_mlp": 1.02476263, "epoch": 0.3288140688411243, "flos": 14903108576640.0, "grad_norm": 1.9818058078172698, "language_loss": 0.7869612, "learning_rate": 3.134205594339942e-06, "loss": 0.80854189, "num_input_tokens_seen": 117418800, "step": 5469, "time_per_iteration": 2.6281590461730957 }, { "auxiliary_loss_clip": 0.01104752, "auxiliary_loss_mlp": 0.01033111, "balance_loss_clip": 1.04863441, "balance_loss_mlp": 1.01838851, "epoch": 0.32887419209379226, "flos": 18551345235840.0, "grad_norm": 1.9383846382167882, "language_loss": 0.81744516, "learning_rate": 3.133884793883107e-06, "loss": 0.8388238, "num_input_tokens_seen": 117438220, "step": 5470, "time_per_iteration": 2.8643784523010254 }, { "auxiliary_loss_clip": 0.01140563, "auxiliary_loss_mlp": 0.01045939, "balance_loss_clip": 1.05232358, "balance_loss_mlp": 1.03021562, "epoch": 0.3289343153464602, "flos": 48105610439040.0, "grad_norm": 2.0914054865715768, "language_loss": 0.67699564, "learning_rate": 3.1335639504292478e-06, "loss": 0.69886065, "num_input_tokens_seen": 117462560, "step": 5471, "time_per_iteration": 2.851717948913574 }, { "auxiliary_loss_clip": 0.01148136, "auxiliary_loss_mlp": 0.01043561, "balance_loss_clip": 1.05701339, "balance_loss_mlp": 1.02594161, "epoch": 0.3289944385991282, "flos": 27600295207680.0, "grad_norm": 2.097557855250848, "language_loss": 0.64926231, "learning_rate": 3.1332430639905288e-06, "loss": 0.67117929, "num_input_tokens_seen": 117483665, "step": 5472, "time_per_iteration": 2.6586108207702637 }, { "auxiliary_loss_clip": 0.01128351, "auxiliary_loss_mlp": 0.01045454, "balance_loss_clip": 1.05333138, "balance_loss_mlp": 1.02850199, "epoch": 0.32905456185179616, "flos": 20120318282880.0, "grad_norm": 3.4668570750263155, "language_loss": 0.88257217, "learning_rate": 3.13292213457912e-06, "loss": 0.90431023, "num_input_tokens_seen": 117503565, "step": 5473, "time_per_iteration": 2.6792144775390625 }, { "auxiliary_loss_clip": 0.01103479, "auxiliary_loss_mlp": 0.01038881, "balance_loss_clip": 1.04814398, "balance_loss_mlp": 1.02123809, "epoch": 0.3291146851044642, "flos": 23180230080000.0, "grad_norm": 1.8710184691373295, "language_loss": 0.78193343, "learning_rate": 3.1326011622071903e-06, "loss": 0.80335701, "num_input_tokens_seen": 117521460, "step": 5474, "time_per_iteration": 2.739057779312134 }, { "auxiliary_loss_clip": 0.01038022, "auxiliary_loss_mlp": 0.01029239, "balance_loss_clip": 1.02788568, "balance_loss_mlp": 1.02673554, "epoch": 0.32917480835713214, "flos": 67621912594560.0, "grad_norm": 0.8109823017171686, "language_loss": 0.6018818, "learning_rate": 3.132280146886911e-06, "loss": 0.62255442, "num_input_tokens_seen": 117580550, "step": 5475, "time_per_iteration": 3.196384906768799 }, { "auxiliary_loss_clip": 0.01091837, "auxiliary_loss_mlp": 0.01057279, "balance_loss_clip": 1.04454446, "balance_loss_mlp": 1.03726411, "epoch": 0.3292349316098001, "flos": 27964537073280.0, "grad_norm": 4.962450920257536, "language_loss": 0.76504046, "learning_rate": 3.131959088630455e-06, "loss": 0.78653169, "num_input_tokens_seen": 117600645, "step": 5476, "time_per_iteration": 2.7369961738586426 }, { "auxiliary_loss_clip": 0.01100541, "auxiliary_loss_mlp": 0.01044762, "balance_loss_clip": 1.04824603, "balance_loss_mlp": 1.02946782, "epoch": 0.3292950548624681, "flos": 20263673462400.0, "grad_norm": 2.5019671735892937, "language_loss": 0.74746907, "learning_rate": 3.131637987449997e-06, "loss": 0.76892209, "num_input_tokens_seen": 117618880, "step": 5477, "time_per_iteration": 2.814467430114746 }, { "auxiliary_loss_clip": 0.01135692, "auxiliary_loss_mlp": 0.01042652, "balance_loss_clip": 1.05235898, "balance_loss_mlp": 1.02838814, "epoch": 0.32935517811513604, "flos": 20812999132800.0, "grad_norm": 3.9065130557825234, "language_loss": 0.75539625, "learning_rate": 3.131316843357713e-06, "loss": 0.77717972, "num_input_tokens_seen": 117636445, "step": 5478, "time_per_iteration": 2.730445384979248 }, { "auxiliary_loss_clip": 0.0112467, "auxiliary_loss_mlp": 0.01042056, "balance_loss_clip": 1.04921985, "balance_loss_mlp": 1.02750051, "epoch": 0.329415301367804, "flos": 18441853603200.0, "grad_norm": 2.855777191383278, "language_loss": 0.80462509, "learning_rate": 3.1309956563657807e-06, "loss": 0.82629234, "num_input_tokens_seen": 117653105, "step": 5479, "time_per_iteration": 2.6443796157836914 }, { "auxiliary_loss_clip": 0.01037863, "auxiliary_loss_mlp": 0.01000413, "balance_loss_clip": 1.02671266, "balance_loss_mlp": 0.99823159, "epoch": 0.32947542462047197, "flos": 66323024887680.0, "grad_norm": 0.7530723778079996, "language_loss": 0.56519568, "learning_rate": 3.1306744264863804e-06, "loss": 0.58557844, "num_input_tokens_seen": 117719225, "step": 5480, "time_per_iteration": 3.213240146636963 }, { "auxiliary_loss_clip": 0.01124019, "auxiliary_loss_mlp": 0.00774449, "balance_loss_clip": 1.04898739, "balance_loss_mlp": 1.00116146, "epoch": 0.32953554787313993, "flos": 23221599569280.0, "grad_norm": 1.7923941739082951, "language_loss": 0.77444887, "learning_rate": 3.1303531537316915e-06, "loss": 0.79343355, "num_input_tokens_seen": 117738725, "step": 5481, "time_per_iteration": 2.6905598640441895 }, { "auxiliary_loss_clip": 0.01119194, "auxiliary_loss_mlp": 0.01050738, "balance_loss_clip": 1.05167091, "balance_loss_mlp": 1.03557408, "epoch": 0.3295956711258079, "flos": 27009492307200.0, "grad_norm": 1.5874205685036498, "language_loss": 0.78222132, "learning_rate": 3.130031838113899e-06, "loss": 0.80392069, "num_input_tokens_seen": 117757765, "step": 5482, "time_per_iteration": 2.765235424041748 }, { "auxiliary_loss_clip": 0.01130055, "auxiliary_loss_mlp": 0.01052605, "balance_loss_clip": 1.05121589, "balance_loss_mlp": 1.03674388, "epoch": 0.32965579437847586, "flos": 19171702051200.0, "grad_norm": 2.9405789595849385, "language_loss": 0.73674762, "learning_rate": 3.129710479645185e-06, "loss": 0.75857425, "num_input_tokens_seen": 117776810, "step": 5483, "time_per_iteration": 2.624969005584717 }, { "auxiliary_loss_clip": 0.01122896, "auxiliary_loss_mlp": 0.01054419, "balance_loss_clip": 1.05069685, "balance_loss_mlp": 1.03886831, "epoch": 0.32971591763114383, "flos": 30482521401600.0, "grad_norm": 1.8706124903497952, "language_loss": 0.75649381, "learning_rate": 3.1293890783377366e-06, "loss": 0.77826691, "num_input_tokens_seen": 117797730, "step": 5484, "time_per_iteration": 2.7650864124298096 }, { "auxiliary_loss_clip": 0.01141223, "auxiliary_loss_mlp": 0.01053478, "balance_loss_clip": 1.05515027, "balance_loss_mlp": 1.03807664, "epoch": 0.3297760408838118, "flos": 16289583598080.0, "grad_norm": 72.4202789440072, "language_loss": 0.71719176, "learning_rate": 3.129067634203742e-06, "loss": 0.73913872, "num_input_tokens_seen": 117815365, "step": 5485, "time_per_iteration": 2.603039264678955 }, { "auxiliary_loss_clip": 0.01081054, "auxiliary_loss_mlp": 0.01052335, "balance_loss_clip": 1.04921818, "balance_loss_mlp": 1.03822041, "epoch": 0.32983616413647976, "flos": 29530924341120.0, "grad_norm": 1.6108204077161399, "language_loss": 0.80275488, "learning_rate": 3.128746147255388e-06, "loss": 0.82408869, "num_input_tokens_seen": 117836095, "step": 5486, "time_per_iteration": 2.8364202976226807 }, { "auxiliary_loss_clip": 0.01106188, "auxiliary_loss_mlp": 0.01053006, "balance_loss_clip": 1.04739475, "balance_loss_mlp": 1.03650784, "epoch": 0.3298962873891478, "flos": 20631398947200.0, "grad_norm": 2.173231613182175, "language_loss": 0.84374005, "learning_rate": 3.1284246175048683e-06, "loss": 0.86533195, "num_input_tokens_seen": 117854655, "step": 5487, "time_per_iteration": 2.7796428203582764 }, { "auxiliary_loss_clip": 0.01087509, "auxiliary_loss_mlp": 0.01055173, "balance_loss_clip": 1.04317069, "balance_loss_mlp": 1.0379355, "epoch": 0.32995641064181574, "flos": 14976007228800.0, "grad_norm": 2.633362688401157, "language_loss": 0.74667275, "learning_rate": 3.1281030449643735e-06, "loss": 0.76809955, "num_input_tokens_seen": 117873300, "step": 5488, "time_per_iteration": 2.7173233032226562 }, { "auxiliary_loss_clip": 0.01143363, "auxiliary_loss_mlp": 0.01051325, "balance_loss_clip": 1.05679107, "balance_loss_mlp": 1.03563726, "epoch": 0.3300165338944837, "flos": 18661447399680.0, "grad_norm": 2.518818086418956, "language_loss": 0.71718305, "learning_rate": 3.127781429646098e-06, "loss": 0.7391299, "num_input_tokens_seen": 117891540, "step": 5489, "time_per_iteration": 2.6647188663482666 }, { "auxiliary_loss_clip": 0.01137372, "auxiliary_loss_mlp": 0.01044261, "balance_loss_clip": 1.05154073, "balance_loss_mlp": 1.02973497, "epoch": 0.3300766571471517, "flos": 25583730785280.0, "grad_norm": 6.067113992727344, "language_loss": 0.88346136, "learning_rate": 3.127459771562238e-06, "loss": 0.90527773, "num_input_tokens_seen": 117907690, "step": 5490, "time_per_iteration": 2.594193696975708 }, { "auxiliary_loss_clip": 0.01127009, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.05081856, "balance_loss_mlp": 1.02396214, "epoch": 0.33013678039981964, "flos": 11363501623680.0, "grad_norm": 5.091693260582257, "language_loss": 0.83396459, "learning_rate": 3.1271380707249907e-06, "loss": 0.85562241, "num_input_tokens_seen": 117925640, "step": 5491, "time_per_iteration": 2.6124439239501953 }, { "auxiliary_loss_clip": 0.01111643, "auxiliary_loss_mlp": 0.01048849, "balance_loss_clip": 1.05066538, "balance_loss_mlp": 1.03372788, "epoch": 0.3301969036524876, "flos": 24821203939200.0, "grad_norm": 1.9936853829327341, "language_loss": 0.77453989, "learning_rate": 3.126816327146554e-06, "loss": 0.79614484, "num_input_tokens_seen": 117944525, "step": 5492, "time_per_iteration": 4.26681923866272 }, { "auxiliary_loss_clip": 0.01144384, "auxiliary_loss_mlp": 0.01046422, "balance_loss_clip": 1.05559993, "balance_loss_mlp": 1.02987576, "epoch": 0.33025702690515557, "flos": 15961144613760.0, "grad_norm": 2.586093125227841, "language_loss": 0.74295127, "learning_rate": 3.12649454083913e-06, "loss": 0.76485932, "num_input_tokens_seen": 117962515, "step": 5493, "time_per_iteration": 2.572657585144043 }, { "auxiliary_loss_clip": 0.01007495, "auxiliary_loss_mlp": 0.01051184, "balance_loss_clip": 1.0238874, "balance_loss_mlp": 1.0491215, "epoch": 0.33031715015782354, "flos": 59416755989760.0, "grad_norm": 0.7952972655943692, "language_loss": 0.53981996, "learning_rate": 3.12617271181492e-06, "loss": 0.5604068, "num_input_tokens_seen": 118018780, "step": 5494, "time_per_iteration": 3.2123944759368896 }, { "auxiliary_loss_clip": 0.01114646, "auxiliary_loss_mlp": 0.0103786, "balance_loss_clip": 1.04879999, "balance_loss_mlp": 1.02241075, "epoch": 0.3303772734104915, "flos": 23184360144000.0, "grad_norm": 1.4867113292626302, "language_loss": 0.87236047, "learning_rate": 3.1258508400861276e-06, "loss": 0.89388549, "num_input_tokens_seen": 118038610, "step": 5495, "time_per_iteration": 4.180245637893677 }, { "auxiliary_loss_clip": 0.01104415, "auxiliary_loss_mlp": 0.0104461, "balance_loss_clip": 1.0520072, "balance_loss_mlp": 1.02813482, "epoch": 0.33043739666315947, "flos": 33071896010880.0, "grad_norm": 2.0634169818588157, "language_loss": 0.73468459, "learning_rate": 3.1255289256649587e-06, "loss": 0.7561748, "num_input_tokens_seen": 118055905, "step": 5496, "time_per_iteration": 2.816849946975708 }, { "auxiliary_loss_clip": 0.01107244, "auxiliary_loss_mlp": 0.01039897, "balance_loss_clip": 1.04852057, "balance_loss_mlp": 1.02469766, "epoch": 0.33049751991582743, "flos": 24895431394560.0, "grad_norm": 2.430684839051296, "language_loss": 0.72464252, "learning_rate": 3.1252069685636196e-06, "loss": 0.74611384, "num_input_tokens_seen": 118073695, "step": 5497, "time_per_iteration": 4.314718961715698 }, { "auxiliary_loss_clip": 0.01111966, "auxiliary_loss_mlp": 0.01038015, "balance_loss_clip": 1.05051875, "balance_loss_mlp": 1.02313733, "epoch": 0.3305576431684954, "flos": 29460575554560.0, "grad_norm": 1.9082848646705384, "language_loss": 0.804672, "learning_rate": 3.124884968794321e-06, "loss": 0.82617176, "num_input_tokens_seen": 118094030, "step": 5498, "time_per_iteration": 2.831347942352295 }, { "auxiliary_loss_clip": 0.01121599, "auxiliary_loss_mlp": 0.01041664, "balance_loss_clip": 1.04826963, "balance_loss_mlp": 1.02467656, "epoch": 0.33061776642116336, "flos": 22632305040000.0, "grad_norm": 2.0593804502858823, "language_loss": 0.75822198, "learning_rate": 3.12456292636927e-06, "loss": 0.77985466, "num_input_tokens_seen": 118111665, "step": 5499, "time_per_iteration": 4.880478858947754 }, { "auxiliary_loss_clip": 0.01119724, "auxiliary_loss_mlp": 0.01035684, "balance_loss_clip": 1.05307007, "balance_loss_mlp": 1.02016318, "epoch": 0.3306778896738313, "flos": 25776320532480.0, "grad_norm": 2.088317081581358, "language_loss": 0.78981787, "learning_rate": 3.124240841300681e-06, "loss": 0.81137192, "num_input_tokens_seen": 118132435, "step": 5500, "time_per_iteration": 2.7601048946380615 }, { "auxiliary_loss_clip": 0.01131843, "auxiliary_loss_mlp": 0.0103364, "balance_loss_clip": 1.0540576, "balance_loss_mlp": 1.01751041, "epoch": 0.33073801292649935, "flos": 36940552479360.0, "grad_norm": 8.499573931934933, "language_loss": 0.6655246, "learning_rate": 3.1239187136007665e-06, "loss": 0.68717939, "num_input_tokens_seen": 118155255, "step": 5501, "time_per_iteration": 2.7880568504333496 }, { "auxiliary_loss_clip": 0.01130024, "auxiliary_loss_mlp": 0.01044854, "balance_loss_clip": 1.05215073, "balance_loss_mlp": 1.02766418, "epoch": 0.3307981361791673, "flos": 12967738848000.0, "grad_norm": 2.417495150038941, "language_loss": 0.77221018, "learning_rate": 3.1235965432817417e-06, "loss": 0.79395902, "num_input_tokens_seen": 118169865, "step": 5502, "time_per_iteration": 2.621891736984253 }, { "auxiliary_loss_clip": 0.01120279, "auxiliary_loss_mlp": 0.01041312, "balance_loss_clip": 1.05816746, "balance_loss_mlp": 1.02508807, "epoch": 0.3308582594318353, "flos": 25374372364800.0, "grad_norm": 1.6870244228079128, "language_loss": 0.72882998, "learning_rate": 3.123274330355824e-06, "loss": 0.75044584, "num_input_tokens_seen": 118190760, "step": 5503, "time_per_iteration": 2.731391191482544 }, { "auxiliary_loss_clip": 0.01107126, "auxiliary_loss_mlp": 0.01042991, "balance_loss_clip": 1.04483843, "balance_loss_mlp": 1.02543116, "epoch": 0.33091838268450324, "flos": 26468570419200.0, "grad_norm": 1.6983408951831631, "language_loss": 0.75341403, "learning_rate": 3.12295207483523e-06, "loss": 0.77491516, "num_input_tokens_seen": 118213620, "step": 5504, "time_per_iteration": 2.734440565109253 }, { "auxiliary_loss_clip": 0.01116159, "auxiliary_loss_mlp": 0.01038384, "balance_loss_clip": 1.05076432, "balance_loss_mlp": 1.02267826, "epoch": 0.3309785059371712, "flos": 24971167221120.0, "grad_norm": 1.5921827086772462, "language_loss": 0.69537103, "learning_rate": 3.1226297767321816e-06, "loss": 0.71691644, "num_input_tokens_seen": 118235010, "step": 5505, "time_per_iteration": 2.7224769592285156 }, { "auxiliary_loss_clip": 0.0112242, "auxiliary_loss_mlp": 0.01050735, "balance_loss_clip": 1.04997373, "balance_loss_mlp": 1.03454661, "epoch": 0.3310386291898392, "flos": 20446710192000.0, "grad_norm": 1.6566524839278514, "language_loss": 0.81701219, "learning_rate": 3.122307436058899e-06, "loss": 0.83874375, "num_input_tokens_seen": 118255820, "step": 5506, "time_per_iteration": 2.6608633995056152 }, { "auxiliary_loss_clip": 0.01126393, "auxiliary_loss_mlp": 0.01036938, "balance_loss_clip": 1.05129898, "balance_loss_mlp": 1.02032042, "epoch": 0.33109875244250714, "flos": 23182672204800.0, "grad_norm": 2.1165262291534663, "language_loss": 0.7961843, "learning_rate": 3.121985052827606e-06, "loss": 0.81781757, "num_input_tokens_seen": 118274160, "step": 5507, "time_per_iteration": 2.6279826164245605 }, { "auxiliary_loss_clip": 0.01115407, "auxiliary_loss_mlp": 0.0104488, "balance_loss_clip": 1.04948068, "balance_loss_mlp": 1.02901316, "epoch": 0.3311588756951751, "flos": 24168384207360.0, "grad_norm": 1.8252383106416188, "language_loss": 0.71632457, "learning_rate": 3.1216626270505274e-06, "loss": 0.73792744, "num_input_tokens_seen": 118294385, "step": 5508, "time_per_iteration": 2.666274070739746 }, { "auxiliary_loss_clip": 0.01105407, "auxiliary_loss_mlp": 0.01035431, "balance_loss_clip": 1.04841506, "balance_loss_mlp": 1.02048194, "epoch": 0.33121899894784307, "flos": 28145742209280.0, "grad_norm": 2.0681023318662053, "language_loss": 0.71877921, "learning_rate": 3.12134015873989e-06, "loss": 0.74018759, "num_input_tokens_seen": 118313105, "step": 5509, "time_per_iteration": 2.9805185794830322 }, { "auxiliary_loss_clip": 0.01123913, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.05431342, "balance_loss_mlp": 1.02019095, "epoch": 0.33127912220051103, "flos": 29567660976000.0, "grad_norm": 1.690455092128618, "language_loss": 0.72850806, "learning_rate": 3.121017647907921e-06, "loss": 0.75010473, "num_input_tokens_seen": 118335250, "step": 5510, "time_per_iteration": 2.7012648582458496 }, { "auxiliary_loss_clip": 0.01097101, "auxiliary_loss_mlp": 0.01036395, "balance_loss_clip": 1.04754674, "balance_loss_mlp": 1.02099323, "epoch": 0.331339245453179, "flos": 14428836374400.0, "grad_norm": 2.529653220973509, "language_loss": 0.87842733, "learning_rate": 3.1206950945668508e-06, "loss": 0.89976227, "num_input_tokens_seen": 118351470, "step": 5511, "time_per_iteration": 2.699303150177002 }, { "auxiliary_loss_clip": 0.01077351, "auxiliary_loss_mlp": 0.0103825, "balance_loss_clip": 1.04569423, "balance_loss_mlp": 1.0232892, "epoch": 0.33139936870584696, "flos": 20887118847360.0, "grad_norm": 2.0800696693803404, "language_loss": 0.73301774, "learning_rate": 3.12037249872891e-06, "loss": 0.7541737, "num_input_tokens_seen": 118370970, "step": 5512, "time_per_iteration": 2.773071765899658 }, { "auxiliary_loss_clip": 0.01092657, "auxiliary_loss_mlp": 0.01037164, "balance_loss_clip": 1.04608238, "balance_loss_mlp": 1.02226281, "epoch": 0.33145949195851493, "flos": 36284356869120.0, "grad_norm": 28.686212163123738, "language_loss": 0.7188127, "learning_rate": 3.1200498604063317e-06, "loss": 0.74011087, "num_input_tokens_seen": 118393125, "step": 5513, "time_per_iteration": 2.832712411880493 }, { "auxiliary_loss_clip": 0.0110331, "auxiliary_loss_mlp": 0.01037016, "balance_loss_clip": 1.0480994, "balance_loss_mlp": 1.02052951, "epoch": 0.33151961521118295, "flos": 14279735018880.0, "grad_norm": 1.9100766123367274, "language_loss": 0.68260789, "learning_rate": 3.1197271796113507e-06, "loss": 0.70401114, "num_input_tokens_seen": 118410860, "step": 5514, "time_per_iteration": 2.62347674369812 }, { "auxiliary_loss_clip": 0.01111479, "auxiliary_loss_mlp": 0.01042546, "balance_loss_clip": 1.04936767, "balance_loss_mlp": 1.02481997, "epoch": 0.3315797384638509, "flos": 20774323163520.0, "grad_norm": 1.9179680687741931, "language_loss": 0.65994096, "learning_rate": 3.1194044563562026e-06, "loss": 0.68148118, "num_input_tokens_seen": 118429570, "step": 5515, "time_per_iteration": 2.6913952827453613 }, { "auxiliary_loss_clip": 0.01121539, "auxiliary_loss_mlp": 0.01039988, "balance_loss_clip": 1.04903245, "balance_loss_mlp": 1.02393019, "epoch": 0.3316398617165189, "flos": 24679464871680.0, "grad_norm": 1.8088538037879305, "language_loss": 0.69273043, "learning_rate": 3.1190816906531257e-06, "loss": 0.71434575, "num_input_tokens_seen": 118450285, "step": 5516, "time_per_iteration": 2.6469173431396484 }, { "auxiliary_loss_clip": 0.011287, "auxiliary_loss_mlp": 0.01039737, "balance_loss_clip": 1.05089724, "balance_loss_mlp": 1.02339315, "epoch": 0.33169998496918685, "flos": 18587974129920.0, "grad_norm": 3.871010712989623, "language_loss": 0.79914033, "learning_rate": 3.118758882514359e-06, "loss": 0.82082474, "num_input_tokens_seen": 118468270, "step": 5517, "time_per_iteration": 2.6387667655944824 }, { "auxiliary_loss_clip": 0.01113973, "auxiliary_loss_mlp": 0.01040442, "balance_loss_clip": 1.04587924, "balance_loss_mlp": 1.02412271, "epoch": 0.3317601082218548, "flos": 20193647898240.0, "grad_norm": 1.7856922866156533, "language_loss": 0.74043357, "learning_rate": 3.118436031952143e-06, "loss": 0.76197767, "num_input_tokens_seen": 118486615, "step": 5518, "time_per_iteration": 2.6136653423309326 }, { "auxiliary_loss_clip": 0.01035845, "auxiliary_loss_mlp": 0.0100663, "balance_loss_clip": 1.02549803, "balance_loss_mlp": 1.00447261, "epoch": 0.3318202314745228, "flos": 68974703637120.0, "grad_norm": 0.6165261089589951, "language_loss": 0.54330659, "learning_rate": 3.1181131389787206e-06, "loss": 0.56373143, "num_input_tokens_seen": 118553580, "step": 5519, "time_per_iteration": 3.3124027252197266 }, { "auxiliary_loss_clip": 0.01129225, "auxiliary_loss_mlp": 0.01042237, "balance_loss_clip": 1.05353975, "balance_loss_mlp": 1.02483273, "epoch": 0.33188035472719074, "flos": 21500113374720.0, "grad_norm": 2.4445902922344342, "language_loss": 0.78693354, "learning_rate": 3.117790203606336e-06, "loss": 0.80864823, "num_input_tokens_seen": 118570280, "step": 5520, "time_per_iteration": 2.680413246154785 }, { "auxiliary_loss_clip": 0.0111174, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.04981971, "balance_loss_mlp": 1.01946807, "epoch": 0.3319404779798587, "flos": 28870490926080.0, "grad_norm": 2.1205551001068645, "language_loss": 0.76597643, "learning_rate": 3.1174672258472344e-06, "loss": 0.78743839, "num_input_tokens_seen": 118590455, "step": 5521, "time_per_iteration": 2.7977516651153564 }, { "auxiliary_loss_clip": 0.01128356, "auxiliary_loss_mlp": 0.0104906, "balance_loss_clip": 1.0500772, "balance_loss_mlp": 1.0320611, "epoch": 0.33200060123252667, "flos": 23076915586560.0, "grad_norm": 5.546447388917159, "language_loss": 0.70404172, "learning_rate": 3.117144205713664e-06, "loss": 0.72581589, "num_input_tokens_seen": 118609495, "step": 5522, "time_per_iteration": 2.7343335151672363 }, { "auxiliary_loss_clip": 0.01112615, "auxiliary_loss_mlp": 0.01039333, "balance_loss_clip": 1.04872596, "balance_loss_mlp": 1.02413392, "epoch": 0.33206072448519464, "flos": 21142479611520.0, "grad_norm": 2.5717643633026133, "language_loss": 0.7406925, "learning_rate": 3.1168211432178735e-06, "loss": 0.76221192, "num_input_tokens_seen": 118628720, "step": 5523, "time_per_iteration": 2.6910529136657715 }, { "auxiliary_loss_clip": 0.01108522, "auxiliary_loss_mlp": 0.01039859, "balance_loss_clip": 1.04778576, "balance_loss_mlp": 1.02415287, "epoch": 0.3321208477378626, "flos": 13079097987840.0, "grad_norm": 1.7441145490896364, "language_loss": 0.82432246, "learning_rate": 3.116498038372114e-06, "loss": 0.8458063, "num_input_tokens_seen": 118645955, "step": 5524, "time_per_iteration": 2.747279405593872 }, { "auxiliary_loss_clip": 0.01094215, "auxiliary_loss_mlp": 0.00773366, "balance_loss_clip": 1.04763544, "balance_loss_mlp": 1.000983, "epoch": 0.33218097099053057, "flos": 21215414177280.0, "grad_norm": 1.8821817398487202, "language_loss": 0.83040905, "learning_rate": 3.116174891188636e-06, "loss": 0.84908485, "num_input_tokens_seen": 118665605, "step": 5525, "time_per_iteration": 2.7802865505218506 }, { "auxiliary_loss_clip": 0.01051991, "auxiliary_loss_mlp": 0.01009126, "balance_loss_clip": 1.02309918, "balance_loss_mlp": 1.00730228, "epoch": 0.33224109424319853, "flos": 64348979189760.0, "grad_norm": 0.7599038914172829, "language_loss": 0.52588648, "learning_rate": 3.1158517016796945e-06, "loss": 0.54649764, "num_input_tokens_seen": 118728155, "step": 5526, "time_per_iteration": 3.1430625915527344 }, { "auxiliary_loss_clip": 0.01100912, "auxiliary_loss_mlp": 0.00775153, "balance_loss_clip": 1.05235875, "balance_loss_mlp": 1.00101066, "epoch": 0.33230121749586655, "flos": 17346003523200.0, "grad_norm": 1.9434005693126541, "language_loss": 0.77540255, "learning_rate": 3.1155284698575445e-06, "loss": 0.79416323, "num_input_tokens_seen": 118743955, "step": 5527, "time_per_iteration": 2.779862403869629 }, { "auxiliary_loss_clip": 0.01095485, "auxiliary_loss_mlp": 0.01045396, "balance_loss_clip": 1.05338502, "balance_loss_mlp": 1.02997637, "epoch": 0.3323613407485345, "flos": 20997041443200.0, "grad_norm": 2.507974613956182, "language_loss": 0.7222321, "learning_rate": 3.1152051957344434e-06, "loss": 0.7436409, "num_input_tokens_seen": 118763275, "step": 5528, "time_per_iteration": 2.7340548038482666 }, { "auxiliary_loss_clip": 0.01112677, "auxiliary_loss_mlp": 0.01037789, "balance_loss_clip": 1.04796624, "balance_loss_mlp": 1.02333462, "epoch": 0.3324214640012025, "flos": 13152535344000.0, "grad_norm": 1.86583443755271, "language_loss": 0.82796729, "learning_rate": 3.1148818793226497e-06, "loss": 0.84947193, "num_input_tokens_seen": 118781110, "step": 5529, "time_per_iteration": 2.6532175540924072 }, { "auxiliary_loss_clip": 0.01113738, "auxiliary_loss_mlp": 0.00775289, "balance_loss_clip": 1.04990721, "balance_loss_mlp": 1.00095487, "epoch": 0.33248158725387045, "flos": 22273522041600.0, "grad_norm": 2.91854332756289, "language_loss": 0.69676769, "learning_rate": 3.114558520634423e-06, "loss": 0.71565795, "num_input_tokens_seen": 118800620, "step": 5530, "time_per_iteration": 2.708841323852539 }, { "auxiliary_loss_clip": 0.01126266, "auxiliary_loss_mlp": 0.01050268, "balance_loss_clip": 1.05040276, "balance_loss_mlp": 1.03394794, "epoch": 0.3325417105065384, "flos": 20740998320640.0, "grad_norm": 2.896961644373142, "language_loss": 0.75989115, "learning_rate": 3.1142351196820256e-06, "loss": 0.7816565, "num_input_tokens_seen": 118818725, "step": 5531, "time_per_iteration": 2.672736167907715 }, { "auxiliary_loss_clip": 0.01118495, "auxiliary_loss_mlp": 0.0104264, "balance_loss_clip": 1.05284333, "balance_loss_mlp": 1.0260222, "epoch": 0.3326018337592064, "flos": 24790536702720.0, "grad_norm": 2.0175366752259465, "language_loss": 0.73189509, "learning_rate": 3.1139116764777206e-06, "loss": 0.75350642, "num_input_tokens_seen": 118839390, "step": 5532, "time_per_iteration": 4.367426156997681 }, { "auxiliary_loss_clip": 0.0111545, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.0523479, "balance_loss_mlp": 1.01623583, "epoch": 0.33266195701187434, "flos": 14501699112960.0, "grad_norm": 2.031596721272471, "language_loss": 0.65847003, "learning_rate": 3.1135881910337735e-06, "loss": 0.67993426, "num_input_tokens_seen": 118856275, "step": 5533, "time_per_iteration": 2.66029691696167 }, { "auxiliary_loss_clip": 0.01080696, "auxiliary_loss_mlp": 0.01037858, "balance_loss_clip": 1.04513919, "balance_loss_mlp": 1.02147257, "epoch": 0.3327220802645423, "flos": 15304410299520.0, "grad_norm": 2.349847054242377, "language_loss": 0.71297956, "learning_rate": 3.113264663362451e-06, "loss": 0.73416501, "num_input_tokens_seen": 118873830, "step": 5534, "time_per_iteration": 4.27457070350647 }, { "auxiliary_loss_clip": 0.0109151, "auxiliary_loss_mlp": 0.01041219, "balance_loss_clip": 1.04982436, "balance_loss_mlp": 1.02534652, "epoch": 0.3327822035172103, "flos": 23477534951040.0, "grad_norm": 2.0777718313633997, "language_loss": 0.6718514, "learning_rate": 3.1129410934760204e-06, "loss": 0.69317865, "num_input_tokens_seen": 118891560, "step": 5535, "time_per_iteration": 2.774434804916382 }, { "auxiliary_loss_clip": 0.01126643, "auxiliary_loss_mlp": 0.00774026, "balance_loss_clip": 1.04974341, "balance_loss_mlp": 1.00099397, "epoch": 0.33284232676987824, "flos": 25374516019200.0, "grad_norm": 4.4518317449354905, "language_loss": 0.72757089, "learning_rate": 3.1126174813867517e-06, "loss": 0.74657756, "num_input_tokens_seen": 118910260, "step": 5536, "time_per_iteration": 4.211881399154663 }, { "auxiliary_loss_clip": 0.0112639, "auxiliary_loss_mlp": 0.01042922, "balance_loss_clip": 1.05097485, "balance_loss_mlp": 1.02740741, "epoch": 0.3329024500225462, "flos": 23694363400320.0, "grad_norm": 1.6494647990025764, "language_loss": 0.81951326, "learning_rate": 3.112293827106917e-06, "loss": 0.84120637, "num_input_tokens_seen": 118929985, "step": 5537, "time_per_iteration": 2.723938465118408 }, { "auxiliary_loss_clip": 0.01130953, "auxiliary_loss_mlp": 0.01041699, "balance_loss_clip": 1.05334187, "balance_loss_mlp": 1.02568924, "epoch": 0.33296257327521417, "flos": 31723163205120.0, "grad_norm": 2.0361349610506987, "language_loss": 0.71549797, "learning_rate": 3.111970130648789e-06, "loss": 0.73722446, "num_input_tokens_seen": 118951355, "step": 5538, "time_per_iteration": 4.913949489593506 }, { "auxiliary_loss_clip": 0.01120461, "auxiliary_loss_mlp": 0.01037376, "balance_loss_clip": 1.04746032, "balance_loss_mlp": 1.02189124, "epoch": 0.33302269652788213, "flos": 22744705674240.0, "grad_norm": 1.8849765474814903, "language_loss": 0.74648041, "learning_rate": 3.1116463920246424e-06, "loss": 0.76805872, "num_input_tokens_seen": 118970910, "step": 5539, "time_per_iteration": 2.7290310859680176 }, { "auxiliary_loss_clip": 0.01142521, "auxiliary_loss_mlp": 0.01045266, "balance_loss_clip": 1.05175686, "balance_loss_mlp": 1.02844524, "epoch": 0.33308281978055015, "flos": 11473747441920.0, "grad_norm": 1.7887365250144445, "language_loss": 0.71008205, "learning_rate": 3.1113226112467527e-06, "loss": 0.73195994, "num_input_tokens_seen": 118989200, "step": 5540, "time_per_iteration": 2.6340630054473877 }, { "auxiliary_loss_clip": 0.01121672, "auxiliary_loss_mlp": 0.01037813, "balance_loss_clip": 1.04614174, "balance_loss_mlp": 1.02212477, "epoch": 0.3331429430332181, "flos": 38213693112960.0, "grad_norm": 2.2050863595265535, "language_loss": 0.60332179, "learning_rate": 3.1109987883273983e-06, "loss": 0.62491661, "num_input_tokens_seen": 119011030, "step": 5541, "time_per_iteration": 2.9001681804656982 }, { "auxiliary_loss_clip": 0.01116142, "auxiliary_loss_mlp": 0.01045386, "balance_loss_clip": 1.04896498, "balance_loss_mlp": 1.02827907, "epoch": 0.3332030662858861, "flos": 22528667324160.0, "grad_norm": 1.8682676496278656, "language_loss": 0.68843257, "learning_rate": 3.1106749232788584e-06, "loss": 0.7100479, "num_input_tokens_seen": 119030620, "step": 5542, "time_per_iteration": 2.7336552143096924 }, { "auxiliary_loss_clip": 0.01125827, "auxiliary_loss_mlp": 0.01039479, "balance_loss_clip": 1.04983997, "balance_loss_mlp": 1.0241369, "epoch": 0.33326318953855405, "flos": 15997773507840.0, "grad_norm": 1.7424785130645766, "language_loss": 0.75545055, "learning_rate": 3.110351016113414e-06, "loss": 0.7771036, "num_input_tokens_seen": 119048015, "step": 5543, "time_per_iteration": 2.7098708152770996 }, { "auxiliary_loss_clip": 0.01059952, "auxiliary_loss_mlp": 0.01049723, "balance_loss_clip": 1.04679465, "balance_loss_mlp": 1.03153133, "epoch": 0.333323312791222, "flos": 25593535198080.0, "grad_norm": 1.720313350609618, "language_loss": 0.75207818, "learning_rate": 3.110027066843348e-06, "loss": 0.77317488, "num_input_tokens_seen": 119066280, "step": 5544, "time_per_iteration": 2.8580381870269775 }, { "auxiliary_loss_clip": 0.01131382, "auxiliary_loss_mlp": 0.01034467, "balance_loss_clip": 1.0470835, "balance_loss_mlp": 1.01900601, "epoch": 0.33338343604389, "flos": 25119550304640.0, "grad_norm": 1.8195187872515122, "language_loss": 0.70631826, "learning_rate": 3.1097030754809456e-06, "loss": 0.7279768, "num_input_tokens_seen": 119087680, "step": 5545, "time_per_iteration": 2.6675262451171875 }, { "auxiliary_loss_clip": 0.01090227, "auxiliary_loss_mlp": 0.01038197, "balance_loss_clip": 1.04591393, "balance_loss_mlp": 1.0225687, "epoch": 0.33344355929655795, "flos": 16947287579520.0, "grad_norm": 2.0475528286172615, "language_loss": 0.68962657, "learning_rate": 3.1093790420384894e-06, "loss": 0.7109108, "num_input_tokens_seen": 119105820, "step": 5546, "time_per_iteration": 2.6620733737945557 }, { "auxiliary_loss_clip": 0.01099462, "auxiliary_loss_mlp": 0.01039292, "balance_loss_clip": 1.04328573, "balance_loss_mlp": 1.02330589, "epoch": 0.3335036825492259, "flos": 27889591345920.0, "grad_norm": 1.6439201248410251, "language_loss": 0.64893299, "learning_rate": 3.1090549665282702e-06, "loss": 0.67032051, "num_input_tokens_seen": 119126630, "step": 5547, "time_per_iteration": 2.7897326946258545 }, { "auxiliary_loss_clip": 0.0111514, "auxiliary_loss_mlp": 0.0103407, "balance_loss_clip": 1.05108774, "balance_loss_mlp": 1.01957989, "epoch": 0.3335638058018939, "flos": 16179553261440.0, "grad_norm": 2.7266915889905765, "language_loss": 0.85475278, "learning_rate": 3.1087308489625742e-06, "loss": 0.8762449, "num_input_tokens_seen": 119143375, "step": 5548, "time_per_iteration": 2.691776990890503 }, { "auxiliary_loss_clip": 0.0112443, "auxiliary_loss_mlp": 0.01038689, "balance_loss_clip": 1.04759526, "balance_loss_mlp": 1.02190423, "epoch": 0.33362392905456184, "flos": 39896108288640.0, "grad_norm": 2.1593805374763466, "language_loss": 0.74996036, "learning_rate": 3.1084066893536945e-06, "loss": 0.77159154, "num_input_tokens_seen": 119166450, "step": 5549, "time_per_iteration": 2.778918743133545 }, { "auxiliary_loss_clip": 0.01129114, "auxiliary_loss_mlp": 0.01040153, "balance_loss_clip": 1.0509795, "balance_loss_mlp": 1.02330887, "epoch": 0.3336840523072298, "flos": 44271212567040.0, "grad_norm": 2.0942861782322577, "language_loss": 0.6826036, "learning_rate": 3.108082487713921e-06, "loss": 0.70429623, "num_input_tokens_seen": 119189645, "step": 5550, "time_per_iteration": 2.8417065143585205 }, { "auxiliary_loss_clip": 0.01094461, "auxiliary_loss_mlp": 0.01050862, "balance_loss_clip": 1.04752803, "balance_loss_mlp": 1.03398156, "epoch": 0.33374417555989777, "flos": 15085678429440.0, "grad_norm": 3.079168539029832, "language_loss": 0.60630679, "learning_rate": 3.1077582440555495e-06, "loss": 0.62776005, "num_input_tokens_seen": 119208045, "step": 5551, "time_per_iteration": 2.7206614017486572 }, { "auxiliary_loss_clip": 0.01096001, "auxiliary_loss_mlp": 0.01040976, "balance_loss_clip": 1.04871941, "balance_loss_mlp": 1.02429891, "epoch": 0.33380429881256574, "flos": 15849174942720.0, "grad_norm": 5.115117677651213, "language_loss": 0.70642906, "learning_rate": 3.1074339583908746e-06, "loss": 0.72779882, "num_input_tokens_seen": 119224910, "step": 5552, "time_per_iteration": 2.7452614307403564 }, { "auxiliary_loss_clip": 0.0109902, "auxiliary_loss_mlp": 0.01036983, "balance_loss_clip": 1.04360175, "balance_loss_mlp": 1.02150989, "epoch": 0.33386442206523376, "flos": 13480327883520.0, "grad_norm": 2.544991024269762, "language_loss": 0.82464319, "learning_rate": 3.107109630732192e-06, "loss": 0.84600323, "num_input_tokens_seen": 119243290, "step": 5553, "time_per_iteration": 2.755664110183716 }, { "auxiliary_loss_clip": 0.01115353, "auxiliary_loss_mlp": 0.00774656, "balance_loss_clip": 1.05034745, "balance_loss_mlp": 1.00092673, "epoch": 0.3339245453179017, "flos": 16690669839360.0, "grad_norm": 2.0139615227647343, "language_loss": 0.80920005, "learning_rate": 3.1067852610918017e-06, "loss": 0.82810014, "num_input_tokens_seen": 119261195, "step": 5554, "time_per_iteration": 2.701960563659668 }, { "auxiliary_loss_clip": 0.01127546, "auxiliary_loss_mlp": 0.01043388, "balance_loss_clip": 1.05171227, "balance_loss_mlp": 1.02820015, "epoch": 0.3339846685705697, "flos": 24610624456320.0, "grad_norm": 1.6473304910242343, "language_loss": 0.81187713, "learning_rate": 3.1064608494820032e-06, "loss": 0.83358645, "num_input_tokens_seen": 119282845, "step": 5555, "time_per_iteration": 2.697605609893799 }, { "auxiliary_loss_clip": 0.01120953, "auxiliary_loss_mlp": 0.01039289, "balance_loss_clip": 1.04721272, "balance_loss_mlp": 1.02425706, "epoch": 0.33404479182323765, "flos": 30953812775040.0, "grad_norm": 1.6543240081497628, "language_loss": 0.74369228, "learning_rate": 3.106136395915099e-06, "loss": 0.76529467, "num_input_tokens_seen": 119304430, "step": 5556, "time_per_iteration": 2.7341341972351074 }, { "auxiliary_loss_clip": 0.01124745, "auxiliary_loss_mlp": 0.0103615, "balance_loss_clip": 1.05016208, "balance_loss_mlp": 1.02102232, "epoch": 0.3341049150759056, "flos": 23513301918720.0, "grad_norm": 1.6367363007204896, "language_loss": 0.82058722, "learning_rate": 3.105811900403391e-06, "loss": 0.84219617, "num_input_tokens_seen": 119323830, "step": 5557, "time_per_iteration": 2.6798059940338135 }, { "auxiliary_loss_clip": 0.01115524, "auxiliary_loss_mlp": 0.01038861, "balance_loss_clip": 1.04990697, "balance_loss_mlp": 1.02333987, "epoch": 0.3341650383285736, "flos": 24026824707840.0, "grad_norm": 1.4529426900334401, "language_loss": 0.80220526, "learning_rate": 3.1054873629591855e-06, "loss": 0.82374907, "num_input_tokens_seen": 119346340, "step": 5558, "time_per_iteration": 2.760270118713379 }, { "auxiliary_loss_clip": 0.01108428, "auxiliary_loss_mlp": 0.01040994, "balance_loss_clip": 1.04822016, "balance_loss_mlp": 1.02628982, "epoch": 0.33422516158124155, "flos": 24901967669760.0, "grad_norm": 1.5625296304307381, "language_loss": 0.8137213, "learning_rate": 3.105162783594788e-06, "loss": 0.83521557, "num_input_tokens_seen": 119367285, "step": 5559, "time_per_iteration": 2.7685365676879883 }, { "auxiliary_loss_clip": 0.01096895, "auxiliary_loss_mlp": 0.01042951, "balance_loss_clip": 1.04609013, "balance_loss_mlp": 1.02726293, "epoch": 0.3342852848339095, "flos": 18333403464960.0, "grad_norm": 2.3834321283612003, "language_loss": 0.7164095, "learning_rate": 3.1048381623225074e-06, "loss": 0.73780799, "num_input_tokens_seen": 119385370, "step": 5560, "time_per_iteration": 2.721888780593872 }, { "auxiliary_loss_clip": 0.011201, "auxiliary_loss_mlp": 0.01043409, "balance_loss_clip": 1.05215085, "balance_loss_mlp": 1.02716064, "epoch": 0.3343454080865775, "flos": 30046530119040.0, "grad_norm": 2.1203222418546015, "language_loss": 0.75029516, "learning_rate": 3.1045134991546526e-06, "loss": 0.77193022, "num_input_tokens_seen": 119409150, "step": 5561, "time_per_iteration": 2.8445487022399902 }, { "auxiliary_loss_clip": 0.01115063, "auxiliary_loss_mlp": 0.01036711, "balance_loss_clip": 1.05170679, "balance_loss_mlp": 1.02177453, "epoch": 0.33440553133924544, "flos": 16398823835520.0, "grad_norm": 1.6036143049019338, "language_loss": 0.69467896, "learning_rate": 3.1041887941035355e-06, "loss": 0.71619672, "num_input_tokens_seen": 119426475, "step": 5562, "time_per_iteration": 2.664062023162842 }, { "auxiliary_loss_clip": 0.01125323, "auxiliary_loss_mlp": 0.01042082, "balance_loss_clip": 1.05125499, "balance_loss_mlp": 1.02763367, "epoch": 0.3344656545919134, "flos": 24242072958720.0, "grad_norm": 3.5139835262543504, "language_loss": 0.65094876, "learning_rate": 3.1038640471814685e-06, "loss": 0.67262286, "num_input_tokens_seen": 119446900, "step": 5563, "time_per_iteration": 2.70878529548645 }, { "auxiliary_loss_clip": 0.01078552, "auxiliary_loss_mlp": 0.01045974, "balance_loss_clip": 1.04751515, "balance_loss_mlp": 1.0296303, "epoch": 0.3345257778445814, "flos": 52118843149440.0, "grad_norm": 1.4983314251487456, "language_loss": 0.74106556, "learning_rate": 3.103539258400766e-06, "loss": 0.76231086, "num_input_tokens_seen": 119470945, "step": 5564, "time_per_iteration": 3.0751025676727295 }, { "auxiliary_loss_clip": 0.01035298, "auxiliary_loss_mlp": 0.01009529, "balance_loss_clip": 1.03294694, "balance_loss_mlp": 1.00762165, "epoch": 0.33458590109724934, "flos": 68048602254720.0, "grad_norm": 0.7758359845819034, "language_loss": 0.555296, "learning_rate": 3.103214427773745e-06, "loss": 0.57574433, "num_input_tokens_seen": 119529925, "step": 5565, "time_per_iteration": 3.2246947288513184 }, { "auxiliary_loss_clip": 0.01134316, "auxiliary_loss_mlp": 0.01036162, "balance_loss_clip": 1.05123055, "balance_loss_mlp": 1.02145183, "epoch": 0.3346460243499173, "flos": 37414788768000.0, "grad_norm": 2.332924120890769, "language_loss": 0.65000319, "learning_rate": 3.102889555312721e-06, "loss": 0.67170799, "num_input_tokens_seen": 119550700, "step": 5566, "time_per_iteration": 2.8920817375183105 }, { "auxiliary_loss_clip": 0.01115876, "auxiliary_loss_mlp": 0.0103757, "balance_loss_clip": 1.05134845, "balance_loss_mlp": 1.02252626, "epoch": 0.3347061476025853, "flos": 18697358021760.0, "grad_norm": 2.3005222539878436, "language_loss": 0.77525175, "learning_rate": 3.102564641030016e-06, "loss": 0.79678619, "num_input_tokens_seen": 119569295, "step": 5567, "time_per_iteration": 2.82244610786438 }, { "auxiliary_loss_clip": 0.01112911, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.0479182, "balance_loss_mlp": 1.02079725, "epoch": 0.3347662708552533, "flos": 13917827537280.0, "grad_norm": 1.7148039320536435, "language_loss": 0.76432139, "learning_rate": 3.102239684937949e-06, "loss": 0.78582156, "num_input_tokens_seen": 119587375, "step": 5568, "time_per_iteration": 2.689354181289673 }, { "auxiliary_loss_clip": 0.01099358, "auxiliary_loss_mlp": 0.01048314, "balance_loss_clip": 1.04898834, "balance_loss_mlp": 1.03163624, "epoch": 0.33482639410792125, "flos": 19750402068480.0, "grad_norm": 3.260707250765708, "language_loss": 0.70965171, "learning_rate": 3.101914687048842e-06, "loss": 0.73112851, "num_input_tokens_seen": 119604530, "step": 5569, "time_per_iteration": 2.747023344039917 }, { "auxiliary_loss_clip": 0.01099669, "auxiliary_loss_mlp": 0.01034787, "balance_loss_clip": 1.04569411, "balance_loss_mlp": 1.01819277, "epoch": 0.3348865173605892, "flos": 16102991422080.0, "grad_norm": 2.127450904564192, "language_loss": 0.89788258, "learning_rate": 3.10158964737502e-06, "loss": 0.91922712, "num_input_tokens_seen": 119621025, "step": 5570, "time_per_iteration": 2.810328960418701 }, { "auxiliary_loss_clip": 0.01098742, "auxiliary_loss_mlp": 0.01034906, "balance_loss_clip": 1.04593182, "balance_loss_mlp": 1.01970696, "epoch": 0.3349466406132572, "flos": 25008945350400.0, "grad_norm": 2.0196203016458245, "language_loss": 0.79848439, "learning_rate": 3.101264565928808e-06, "loss": 0.81982088, "num_input_tokens_seen": 119641725, "step": 5571, "time_per_iteration": 4.5300047397613525 }, { "auxiliary_loss_clip": 0.01052126, "auxiliary_loss_mlp": 0.00754598, "balance_loss_clip": 1.02251923, "balance_loss_mlp": 1.0014987, "epoch": 0.33500676386592515, "flos": 54319991564160.0, "grad_norm": 0.8956854098175919, "language_loss": 0.5596205, "learning_rate": 3.1009394427225335e-06, "loss": 0.57768774, "num_input_tokens_seen": 119693560, "step": 5572, "time_per_iteration": 3.0931503772735596 }, { "auxiliary_loss_clip": 0.01137277, "auxiliary_loss_mlp": 0.01047626, "balance_loss_clip": 1.05220318, "balance_loss_mlp": 1.03196192, "epoch": 0.3350668871185931, "flos": 26797332625920.0, "grad_norm": 2.019282888464976, "language_loss": 0.78090006, "learning_rate": 3.1006142777685257e-06, "loss": 0.8027491, "num_input_tokens_seen": 119712935, "step": 5573, "time_per_iteration": 2.710340738296509 }, { "auxiliary_loss_clip": 0.01105804, "auxiliary_loss_mlp": 0.01046551, "balance_loss_clip": 1.05004358, "balance_loss_mlp": 1.02974284, "epoch": 0.3351270103712611, "flos": 33510508986240.0, "grad_norm": 3.3664569303363834, "language_loss": 0.7253201, "learning_rate": 3.1002890710791133e-06, "loss": 0.74684364, "num_input_tokens_seen": 119731680, "step": 5574, "time_per_iteration": 4.390132427215576 }, { "auxiliary_loss_clip": 0.01119913, "auxiliary_loss_mlp": 0.01033586, "balance_loss_clip": 1.04622221, "balance_loss_mlp": 1.01882839, "epoch": 0.33518713362392905, "flos": 26506240807680.0, "grad_norm": 1.806126996337021, "language_loss": 0.87605375, "learning_rate": 3.0999638226666287e-06, "loss": 0.89758873, "num_input_tokens_seen": 119752155, "step": 5575, "time_per_iteration": 2.6650984287261963 }, { "auxiliary_loss_clip": 0.01119423, "auxiliary_loss_mlp": 0.01044892, "balance_loss_clip": 1.05073953, "balance_loss_mlp": 1.02783298, "epoch": 0.335247256876597, "flos": 17232345912960.0, "grad_norm": 2.5292682388354404, "language_loss": 0.82834053, "learning_rate": 3.0996385325434063e-06, "loss": 0.84998369, "num_input_tokens_seen": 119769195, "step": 5576, "time_per_iteration": 4.143759727478027 }, { "auxiliary_loss_clip": 0.01126035, "auxiliary_loss_mlp": 0.01042249, "balance_loss_clip": 1.04928613, "balance_loss_mlp": 1.02584612, "epoch": 0.335307380129265, "flos": 25629373992960.0, "grad_norm": 2.62081807641563, "language_loss": 0.72970062, "learning_rate": 3.0993132007217806e-06, "loss": 0.75138342, "num_input_tokens_seen": 119786810, "step": 5577, "time_per_iteration": 4.264250755310059 }, { "auxiliary_loss_clip": 0.01102749, "auxiliary_loss_mlp": 0.01040193, "balance_loss_clip": 1.05250812, "balance_loss_mlp": 1.02409935, "epoch": 0.33536750338193294, "flos": 19680089195520.0, "grad_norm": 2.2461501835528255, "language_loss": 0.8147049, "learning_rate": 3.0989878272140883e-06, "loss": 0.83613431, "num_input_tokens_seen": 119805395, "step": 5578, "time_per_iteration": 2.748187780380249 }, { "auxiliary_loss_clip": 0.01072311, "auxiliary_loss_mlp": 0.0077377, "balance_loss_clip": 1.04737353, "balance_loss_mlp": 1.00086129, "epoch": 0.3354276266346009, "flos": 18332613365760.0, "grad_norm": 2.081067644088489, "language_loss": 0.72135395, "learning_rate": 3.0986624120326676e-06, "loss": 0.73981476, "num_input_tokens_seen": 119823135, "step": 5579, "time_per_iteration": 2.797891616821289 }, { "auxiliary_loss_clip": 0.0108369, "auxiliary_loss_mlp": 0.01042635, "balance_loss_clip": 1.04664183, "balance_loss_mlp": 1.02608919, "epoch": 0.3354877498872689, "flos": 17858556645120.0, "grad_norm": 2.1516301629227255, "language_loss": 0.81264424, "learning_rate": 3.0983369551898573e-06, "loss": 0.83390749, "num_input_tokens_seen": 119842265, "step": 5580, "time_per_iteration": 2.76359224319458 }, { "auxiliary_loss_clip": 0.01112891, "auxiliary_loss_mlp": 0.01034758, "balance_loss_clip": 1.04777932, "balance_loss_mlp": 1.01918936, "epoch": 0.3355478731399369, "flos": 24717745791360.0, "grad_norm": 1.787418199208594, "language_loss": 0.78071463, "learning_rate": 3.0980114566980003e-06, "loss": 0.80219114, "num_input_tokens_seen": 119862500, "step": 5581, "time_per_iteration": 2.6893699169158936 }, { "auxiliary_loss_clip": 0.01102381, "auxiliary_loss_mlp": 0.01044533, "balance_loss_clip": 1.04555583, "balance_loss_mlp": 1.02674723, "epoch": 0.33560799639260486, "flos": 16873886136960.0, "grad_norm": 3.5541134032025528, "language_loss": 0.74734783, "learning_rate": 3.0976859165694384e-06, "loss": 0.76881701, "num_input_tokens_seen": 119880160, "step": 5582, "time_per_iteration": 2.750110149383545 }, { "auxiliary_loss_clip": 0.01109205, "auxiliary_loss_mlp": 0.0104468, "balance_loss_clip": 1.04334664, "balance_loss_mlp": 1.02793145, "epoch": 0.3356681196452728, "flos": 18333511205760.0, "grad_norm": 2.0738327777636574, "language_loss": 0.82039702, "learning_rate": 3.0973603348165166e-06, "loss": 0.84193587, "num_input_tokens_seen": 119899040, "step": 5583, "time_per_iteration": 2.629065990447998 }, { "auxiliary_loss_clip": 0.01113126, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.04719925, "balance_loss_mlp": 1.0322051, "epoch": 0.3357282428979408, "flos": 34750612085760.0, "grad_norm": 2.1437775006956814, "language_loss": 0.77524137, "learning_rate": 3.097034711451581e-06, "loss": 0.79684973, "num_input_tokens_seen": 119921120, "step": 5584, "time_per_iteration": 2.9303438663482666 }, { "auxiliary_loss_clip": 0.01115168, "auxiliary_loss_mlp": 0.01043431, "balance_loss_clip": 1.04803944, "balance_loss_mlp": 1.02755225, "epoch": 0.33578836615060875, "flos": 21580087006080.0, "grad_norm": 1.8068970963649096, "language_loss": 0.76473475, "learning_rate": 3.0967090464869795e-06, "loss": 0.78632081, "num_input_tokens_seen": 119940165, "step": 5585, "time_per_iteration": 2.7168867588043213 }, { "auxiliary_loss_clip": 0.01120824, "auxiliary_loss_mlp": 0.01040676, "balance_loss_clip": 1.04579937, "balance_loss_mlp": 1.02442741, "epoch": 0.3358484894032767, "flos": 24530291688960.0, "grad_norm": 1.8490215812193886, "language_loss": 0.77754235, "learning_rate": 3.0963833399350608e-06, "loss": 0.79915732, "num_input_tokens_seen": 119959730, "step": 5586, "time_per_iteration": 2.88452410697937 }, { "auxiliary_loss_clip": 0.01100333, "auxiliary_loss_mlp": 0.01057166, "balance_loss_clip": 1.0484302, "balance_loss_mlp": 1.03673398, "epoch": 0.3359086126559447, "flos": 22455589104000.0, "grad_norm": 1.6698470723885088, "language_loss": 0.810045, "learning_rate": 3.0960575918081756e-06, "loss": 0.8316201, "num_input_tokens_seen": 119979315, "step": 5587, "time_per_iteration": 2.7335522174835205 }, { "auxiliary_loss_clip": 0.01130777, "auxiliary_loss_mlp": 0.01042735, "balance_loss_clip": 1.04809558, "balance_loss_mlp": 1.02837586, "epoch": 0.33596873590861265, "flos": 16543687386240.0, "grad_norm": 1.8626695130182664, "language_loss": 0.67307252, "learning_rate": 3.095731802118677e-06, "loss": 0.69480765, "num_input_tokens_seen": 119996140, "step": 5588, "time_per_iteration": 2.5910611152648926 }, { "auxiliary_loss_clip": 0.01113468, "auxiliary_loss_mlp": 0.00774774, "balance_loss_clip": 1.04702032, "balance_loss_mlp": 1.0007664, "epoch": 0.3360288591612806, "flos": 31175812782720.0, "grad_norm": 2.758181662666948, "language_loss": 0.70459288, "learning_rate": 3.095405970878919e-06, "loss": 0.72347522, "num_input_tokens_seen": 120017720, "step": 5589, "time_per_iteration": 2.7966625690460205 }, { "auxiliary_loss_clip": 0.01110605, "auxiliary_loss_mlp": 0.01046945, "balance_loss_clip": 1.04478765, "balance_loss_mlp": 1.02951634, "epoch": 0.3360889824139486, "flos": 23696913265920.0, "grad_norm": 6.820816752821097, "language_loss": 0.6717155, "learning_rate": 3.0950800981012567e-06, "loss": 0.69329101, "num_input_tokens_seen": 120036335, "step": 5590, "time_per_iteration": 2.804384231567383 }, { "auxiliary_loss_clip": 0.01107091, "auxiliary_loss_mlp": 0.01044113, "balance_loss_clip": 1.05176187, "balance_loss_mlp": 1.02741194, "epoch": 0.33614910566661654, "flos": 19318109886720.0, "grad_norm": 2.108159500929249, "language_loss": 0.731767, "learning_rate": 3.094754183798047e-06, "loss": 0.75327909, "num_input_tokens_seen": 120056120, "step": 5591, "time_per_iteration": 2.7423245906829834 }, { "auxiliary_loss_clip": 0.01132777, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.04753232, "balance_loss_mlp": 1.02802432, "epoch": 0.3362092289192845, "flos": 16472261191680.0, "grad_norm": 2.4812698890164238, "language_loss": 0.6978277, "learning_rate": 3.0944282279816493e-06, "loss": 0.71958983, "num_input_tokens_seen": 120073650, "step": 5592, "time_per_iteration": 2.624565362930298 }, { "auxiliary_loss_clip": 0.01109265, "auxiliary_loss_mlp": 0.01035799, "balance_loss_clip": 1.0459764, "balance_loss_mlp": 1.02034986, "epoch": 0.33626935217195253, "flos": 24243581329920.0, "grad_norm": 2.2034044743639676, "language_loss": 0.76362681, "learning_rate": 3.094102230664423e-06, "loss": 0.78507739, "num_input_tokens_seen": 120093260, "step": 5593, "time_per_iteration": 2.7709946632385254 }, { "auxiliary_loss_clip": 0.01100555, "auxiliary_loss_mlp": 0.00775613, "balance_loss_clip": 1.04247713, "balance_loss_mlp": 1.00074506, "epoch": 0.3363294754246205, "flos": 19718765164800.0, "grad_norm": 2.2856177577930876, "language_loss": 0.7229932, "learning_rate": 3.093776191858731e-06, "loss": 0.74175489, "num_input_tokens_seen": 120111830, "step": 5594, "time_per_iteration": 2.7880120277404785 }, { "auxiliary_loss_clip": 0.01079557, "auxiliary_loss_mlp": 0.00778898, "balance_loss_clip": 1.04157269, "balance_loss_mlp": 1.00079668, "epoch": 0.33638959867728846, "flos": 22596286677120.0, "grad_norm": 3.2295215673950293, "language_loss": 0.79940557, "learning_rate": 3.0934501115769363e-06, "loss": 0.81799006, "num_input_tokens_seen": 120130470, "step": 5595, "time_per_iteration": 2.8623924255371094 }, { "auxiliary_loss_clip": 0.01111225, "auxiliary_loss_mlp": 0.01039348, "balance_loss_clip": 1.04694319, "balance_loss_mlp": 1.02456045, "epoch": 0.3364497219299564, "flos": 20994742972800.0, "grad_norm": 3.201033356603963, "language_loss": 0.81473815, "learning_rate": 3.0931239898314037e-06, "loss": 0.83624387, "num_input_tokens_seen": 120150735, "step": 5596, "time_per_iteration": 2.900319814682007 }, { "auxiliary_loss_clip": 0.01113286, "auxiliary_loss_mlp": 0.01044516, "balance_loss_clip": 1.04682481, "balance_loss_mlp": 1.02877986, "epoch": 0.3365098451826244, "flos": 25228610974080.0, "grad_norm": 1.642499178477658, "language_loss": 0.75647599, "learning_rate": 3.0927978266344995e-06, "loss": 0.778054, "num_input_tokens_seen": 120173230, "step": 5597, "time_per_iteration": 2.8402984142303467 }, { "auxiliary_loss_clip": 0.0112326, "auxiliary_loss_mlp": 0.01034747, "balance_loss_clip": 1.04734445, "balance_loss_mlp": 1.01902318, "epoch": 0.33656996843529235, "flos": 24571697091840.0, "grad_norm": 1.910742765655482, "language_loss": 0.78611934, "learning_rate": 3.0924716219985916e-06, "loss": 0.80769938, "num_input_tokens_seen": 120191860, "step": 5598, "time_per_iteration": 2.7380945682525635 }, { "auxiliary_loss_clip": 0.01141013, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.04969454, "balance_loss_mlp": 1.0235827, "epoch": 0.3366300916879603, "flos": 44091120752640.0, "grad_norm": 1.511676842650176, "language_loss": 0.6446076, "learning_rate": 3.0921453759360514e-06, "loss": 0.66643113, "num_input_tokens_seen": 120219195, "step": 5599, "time_per_iteration": 2.845017433166504 }, { "auxiliary_loss_clip": 0.01103042, "auxiliary_loss_mlp": 0.01054079, "balance_loss_clip": 1.04571164, "balance_loss_mlp": 1.03408813, "epoch": 0.3366902149406283, "flos": 13879869840000.0, "grad_norm": 3.0475721260430486, "language_loss": 0.8262403, "learning_rate": 3.091819088459249e-06, "loss": 0.84781146, "num_input_tokens_seen": 120232950, "step": 5600, "time_per_iteration": 2.690335512161255 }, { "auxiliary_loss_clip": 0.01128117, "auxiliary_loss_mlp": 0.01045257, "balance_loss_clip": 1.04780042, "balance_loss_mlp": 1.02822232, "epoch": 0.33675033819329625, "flos": 16253098358400.0, "grad_norm": 2.4530209101601037, "language_loss": 0.83457136, "learning_rate": 3.0914927595805573e-06, "loss": 0.856305, "num_input_tokens_seen": 120248865, "step": 5601, "time_per_iteration": 2.760735034942627 }, { "auxiliary_loss_clip": 0.01122256, "auxiliary_loss_mlp": 0.0103673, "balance_loss_clip": 1.04873729, "balance_loss_mlp": 1.02092862, "epoch": 0.3368104614459642, "flos": 17055809544960.0, "grad_norm": 2.1704904083215903, "language_loss": 0.83173311, "learning_rate": 3.0911663893123507e-06, "loss": 0.85332292, "num_input_tokens_seen": 120267820, "step": 5602, "time_per_iteration": 2.6818981170654297 }, { "auxiliary_loss_clip": 0.0113558, "auxiliary_loss_mlp": 0.01053921, "balance_loss_clip": 1.04765427, "balance_loss_mlp": 1.03756535, "epoch": 0.3368705846986322, "flos": 17858628472320.0, "grad_norm": 3.8525391607572477, "language_loss": 0.69046748, "learning_rate": 3.0908399776670048e-06, "loss": 0.71236247, "num_input_tokens_seen": 120286540, "step": 5603, "time_per_iteration": 2.6086158752441406 }, { "auxiliary_loss_clip": 0.01116527, "auxiliary_loss_mlp": 0.01042678, "balance_loss_clip": 1.04876411, "balance_loss_mlp": 1.02617979, "epoch": 0.33693070795130015, "flos": 22929502170240.0, "grad_norm": 1.5388557517073465, "language_loss": 0.83146536, "learning_rate": 3.090513524656898e-06, "loss": 0.85305738, "num_input_tokens_seen": 120307305, "step": 5604, "time_per_iteration": 2.7269375324249268 }, { "auxiliary_loss_clip": 0.01095396, "auxiliary_loss_mlp": 0.01043597, "balance_loss_clip": 1.04384422, "balance_loss_mlp": 1.02708673, "epoch": 0.3369908312039681, "flos": 22017443005440.0, "grad_norm": 1.634462052702842, "language_loss": 0.73473096, "learning_rate": 3.090187030294409e-06, "loss": 0.75612092, "num_input_tokens_seen": 120327845, "step": 5605, "time_per_iteration": 2.712197780609131 }, { "auxiliary_loss_clip": 0.0111786, "auxiliary_loss_mlp": 0.01038834, "balance_loss_clip": 1.04761815, "balance_loss_mlp": 1.02235925, "epoch": 0.33705095445663613, "flos": 11801970944640.0, "grad_norm": 3.8834830456250913, "language_loss": 0.83444858, "learning_rate": 3.089860494591919e-06, "loss": 0.85601556, "num_input_tokens_seen": 120343255, "step": 5606, "time_per_iteration": 2.6680989265441895 }, { "auxiliary_loss_clip": 0.01108557, "auxiliary_loss_mlp": 0.01039061, "balance_loss_clip": 1.04293787, "balance_loss_mlp": 1.02370059, "epoch": 0.3371110777093041, "flos": 25046400257280.0, "grad_norm": 2.0409696956182946, "language_loss": 0.67694759, "learning_rate": 3.089533917561809e-06, "loss": 0.69842374, "num_input_tokens_seen": 120361745, "step": 5607, "time_per_iteration": 2.8172407150268555 }, { "auxiliary_loss_clip": 0.01121964, "auxiliary_loss_mlp": 0.01053243, "balance_loss_clip": 1.04604626, "balance_loss_mlp": 1.03458667, "epoch": 0.33717120096197206, "flos": 26579031719040.0, "grad_norm": 1.9822534609557965, "language_loss": 0.70618403, "learning_rate": 3.089207299216464e-06, "loss": 0.72793615, "num_input_tokens_seen": 120380565, "step": 5608, "time_per_iteration": 2.669027090072632 }, { "auxiliary_loss_clip": 0.01055328, "auxiliary_loss_mlp": 0.01040575, "balance_loss_clip": 1.03931713, "balance_loss_mlp": 1.02449393, "epoch": 0.33723132421464, "flos": 15158541168000.0, "grad_norm": 1.931960515128334, "language_loss": 0.79290974, "learning_rate": 3.088880639568269e-06, "loss": 0.81386876, "num_input_tokens_seen": 120399235, "step": 5609, "time_per_iteration": 2.7859673500061035 }, { "auxiliary_loss_clip": 0.01124996, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.04914641, "balance_loss_mlp": 1.02387619, "epoch": 0.337291447467308, "flos": 23436093634560.0, "grad_norm": 1.7580059679361764, "language_loss": 0.82490408, "learning_rate": 3.0885539386296114e-06, "loss": 0.8465687, "num_input_tokens_seen": 120420095, "step": 5610, "time_per_iteration": 4.319208145141602 }, { "auxiliary_loss_clip": 0.01123032, "auxiliary_loss_mlp": 0.0104256, "balance_loss_clip": 1.0486002, "balance_loss_mlp": 1.02448845, "epoch": 0.33735157071997596, "flos": 17238163916160.0, "grad_norm": 2.0228863025134824, "language_loss": 0.82122159, "learning_rate": 3.088227196412879e-06, "loss": 0.84287751, "num_input_tokens_seen": 120437690, "step": 5611, "time_per_iteration": 2.6127841472625732 }, { "auxiliary_loss_clip": 0.01116485, "auxiliary_loss_mlp": 0.01045036, "balance_loss_clip": 1.04920387, "balance_loss_mlp": 1.02683246, "epoch": 0.3374116939726439, "flos": 28257388657920.0, "grad_norm": 2.0856936331065037, "language_loss": 0.79704899, "learning_rate": 3.0879004129304626e-06, "loss": 0.81866419, "num_input_tokens_seen": 120459240, "step": 5612, "time_per_iteration": 2.7237493991851807 }, { "auxiliary_loss_clip": 0.01076712, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.04079247, "balance_loss_mlp": 1.02410221, "epoch": 0.3374718172253119, "flos": 35919396731520.0, "grad_norm": 2.390785367991082, "language_loss": 0.70200634, "learning_rate": 3.087573588194753e-06, "loss": 0.7231766, "num_input_tokens_seen": 120481090, "step": 5613, "time_per_iteration": 4.43415379524231 }, { "auxiliary_loss_clip": 0.01118495, "auxiliary_loss_mlp": 0.01037291, "balance_loss_clip": 1.04903054, "balance_loss_mlp": 1.02097178, "epoch": 0.33753194047797985, "flos": 18186672407040.0, "grad_norm": 2.1929626699857585, "language_loss": 0.79407388, "learning_rate": 3.087246722218144e-06, "loss": 0.81563175, "num_input_tokens_seen": 120500045, "step": 5614, "time_per_iteration": 2.6484436988830566 }, { "auxiliary_loss_clip": 0.01105902, "auxiliary_loss_mlp": 0.01046863, "balance_loss_clip": 1.04512811, "balance_loss_mlp": 1.02796841, "epoch": 0.3375920637306478, "flos": 23148916398720.0, "grad_norm": 1.967540834348034, "language_loss": 0.91201901, "learning_rate": 3.086919815013031e-06, "loss": 0.93354666, "num_input_tokens_seen": 120521125, "step": 5615, "time_per_iteration": 4.486853361129761 }, { "auxiliary_loss_clip": 0.01119294, "auxiliary_loss_mlp": 0.01042109, "balance_loss_clip": 1.04542458, "balance_loss_mlp": 1.0265168, "epoch": 0.3376521869833158, "flos": 23112215677440.0, "grad_norm": 2.688104519924193, "language_loss": 0.80865037, "learning_rate": 3.086592866591809e-06, "loss": 0.83026439, "num_input_tokens_seen": 120539180, "step": 5616, "time_per_iteration": 2.693419933319092 }, { "auxiliary_loss_clip": 0.01132102, "auxiliary_loss_mlp": 0.00776249, "balance_loss_clip": 1.04987526, "balance_loss_mlp": 1.00074387, "epoch": 0.33771231023598375, "flos": 19274585581440.0, "grad_norm": 5.641479508637021, "language_loss": 0.83967853, "learning_rate": 3.0862658769668774e-06, "loss": 0.85876203, "num_input_tokens_seen": 120556280, "step": 5617, "time_per_iteration": 4.261611461639404 }, { "auxiliary_loss_clip": 0.01065047, "auxiliary_loss_mlp": 0.01048039, "balance_loss_clip": 1.0423851, "balance_loss_mlp": 1.030074, "epoch": 0.3377724334886517, "flos": 18150187167360.0, "grad_norm": 2.2609860925126117, "language_loss": 0.80159199, "learning_rate": 3.0859388461506343e-06, "loss": 0.82272285, "num_input_tokens_seen": 120575395, "step": 5618, "time_per_iteration": 2.8115389347076416 }, { "auxiliary_loss_clip": 0.01092947, "auxiliary_loss_mlp": 0.01037796, "balance_loss_clip": 1.04605365, "balance_loss_mlp": 1.02121353, "epoch": 0.3378325567413197, "flos": 25775997310080.0, "grad_norm": 1.9598490702889584, "language_loss": 0.7111814, "learning_rate": 3.085611774155481e-06, "loss": 0.73248887, "num_input_tokens_seen": 120596075, "step": 5619, "time_per_iteration": 2.86958909034729 }, { "auxiliary_loss_clip": 0.01116213, "auxiliary_loss_mlp": 0.01047745, "balance_loss_clip": 1.04749656, "balance_loss_mlp": 1.03167593, "epoch": 0.3378926799939877, "flos": 21317112558720.0, "grad_norm": 2.630730252639156, "language_loss": 0.70144761, "learning_rate": 3.085284660993821e-06, "loss": 0.72308713, "num_input_tokens_seen": 120614195, "step": 5620, "time_per_iteration": 2.6953368186950684 }, { "auxiliary_loss_clip": 0.01136416, "auxiliary_loss_mlp": 0.01047216, "balance_loss_clip": 1.05076015, "balance_loss_mlp": 1.03201699, "epoch": 0.33795280324665566, "flos": 24900028335360.0, "grad_norm": 1.8373178803043773, "language_loss": 0.67899036, "learning_rate": 3.084957506678058e-06, "loss": 0.70082676, "num_input_tokens_seen": 120634475, "step": 5621, "time_per_iteration": 2.6531872749328613 }, { "auxiliary_loss_clip": 0.0110792, "auxiliary_loss_mlp": 0.01044445, "balance_loss_clip": 1.04716897, "balance_loss_mlp": 1.02814865, "epoch": 0.33801292649932363, "flos": 24753943722240.0, "grad_norm": 1.7693089540657438, "language_loss": 0.82862681, "learning_rate": 3.0846303112205975e-06, "loss": 0.85015041, "num_input_tokens_seen": 120654980, "step": 5622, "time_per_iteration": 2.7764267921447754 }, { "auxiliary_loss_clip": 0.01097036, "auxiliary_loss_mlp": 0.01041227, "balance_loss_clip": 1.043239, "balance_loss_mlp": 1.02565813, "epoch": 0.3380730497519916, "flos": 26723967096960.0, "grad_norm": 7.015051283901371, "language_loss": 0.73815429, "learning_rate": 3.0843030746338464e-06, "loss": 0.75953692, "num_input_tokens_seen": 120676245, "step": 5623, "time_per_iteration": 2.7962961196899414 }, { "auxiliary_loss_clip": 0.0104645, "auxiliary_loss_mlp": 0.01031816, "balance_loss_clip": 1.03514934, "balance_loss_mlp": 1.0298605, "epoch": 0.33813317300465956, "flos": 70035756416640.0, "grad_norm": 0.757644747116446, "language_loss": 0.55002284, "learning_rate": 3.083975796930215e-06, "loss": 0.57080543, "num_input_tokens_seen": 120741965, "step": 5624, "time_per_iteration": 3.3495559692382812 }, { "auxiliary_loss_clip": 0.01091887, "auxiliary_loss_mlp": 0.01055525, "balance_loss_clip": 1.04508519, "balance_loss_mlp": 1.03704786, "epoch": 0.3381932962573275, "flos": 24097317148800.0, "grad_norm": 3.1490866232839876, "language_loss": 0.73299229, "learning_rate": 3.083648478122111e-06, "loss": 0.75446641, "num_input_tokens_seen": 120760410, "step": 5625, "time_per_iteration": 2.7474253177642822 }, { "auxiliary_loss_clip": 0.01127839, "auxiliary_loss_mlp": 0.01045252, "balance_loss_clip": 1.04838002, "balance_loss_mlp": 1.02828884, "epoch": 0.3382534195099955, "flos": 19278248768640.0, "grad_norm": 5.828984180477566, "language_loss": 0.70578009, "learning_rate": 3.0833211182219497e-06, "loss": 0.72751105, "num_input_tokens_seen": 120777705, "step": 5626, "time_per_iteration": 2.6597115993499756 }, { "auxiliary_loss_clip": 0.01108172, "auxiliary_loss_mlp": 0.01041744, "balance_loss_clip": 1.04509664, "balance_loss_mlp": 1.02605569, "epoch": 0.33831354276266346, "flos": 25226240676480.0, "grad_norm": 3.2927176036830574, "language_loss": 0.80853224, "learning_rate": 3.0829937172421425e-06, "loss": 0.83003139, "num_input_tokens_seen": 120798660, "step": 5627, "time_per_iteration": 2.730774402618408 }, { "auxiliary_loss_clip": 0.01131612, "auxiliary_loss_mlp": 0.0077564, "balance_loss_clip": 1.05286694, "balance_loss_mlp": 1.00064421, "epoch": 0.3383736660153314, "flos": 23112000195840.0, "grad_norm": 2.306116347111899, "language_loss": 0.80454439, "learning_rate": 3.0826662751951055e-06, "loss": 0.82361686, "num_input_tokens_seen": 120816705, "step": 5628, "time_per_iteration": 2.691471576690674 }, { "auxiliary_loss_clip": 0.01080566, "auxiliary_loss_mlp": 0.01046147, "balance_loss_clip": 1.04250276, "balance_loss_mlp": 1.02787185, "epoch": 0.3384337892679994, "flos": 23477139901440.0, "grad_norm": 3.64262689820424, "language_loss": 0.77174091, "learning_rate": 3.082338792093254e-06, "loss": 0.79300809, "num_input_tokens_seen": 120835375, "step": 5629, "time_per_iteration": 2.7564992904663086 }, { "auxiliary_loss_clip": 0.01116368, "auxiliary_loss_mlp": 0.01046104, "balance_loss_clip": 1.04699719, "balance_loss_mlp": 1.02819836, "epoch": 0.33849391252066735, "flos": 19425805839360.0, "grad_norm": 4.669184863549949, "language_loss": 0.84738326, "learning_rate": 3.0820112679490074e-06, "loss": 0.86900795, "num_input_tokens_seen": 120854260, "step": 5630, "time_per_iteration": 2.7284910678863525 }, { "auxiliary_loss_clip": 0.0108732, "auxiliary_loss_mlp": 0.01055965, "balance_loss_clip": 1.04692125, "balance_loss_mlp": 1.03889382, "epoch": 0.3385540357733353, "flos": 21064840364160.0, "grad_norm": 2.0951078731071204, "language_loss": 0.71627271, "learning_rate": 3.0816837027747857e-06, "loss": 0.73770559, "num_input_tokens_seen": 120871590, "step": 5631, "time_per_iteration": 2.7423501014709473 }, { "auxiliary_loss_clip": 0.01036653, "auxiliary_loss_mlp": 0.01008716, "balance_loss_clip": 1.02691352, "balance_loss_mlp": 1.00683236, "epoch": 0.3386141590260033, "flos": 69208013450880.0, "grad_norm": 0.8383263502294551, "language_loss": 0.56103444, "learning_rate": 3.0813560965830084e-06, "loss": 0.58148813, "num_input_tokens_seen": 120925550, "step": 5632, "time_per_iteration": 3.24780535697937 }, { "auxiliary_loss_clip": 0.01122742, "auxiliary_loss_mlp": 0.01038822, "balance_loss_clip": 1.05064476, "balance_loss_mlp": 1.02198935, "epoch": 0.3386742822786713, "flos": 25519487310720.0, "grad_norm": 1.5341010429525646, "language_loss": 0.80410492, "learning_rate": 3.0810284493861005e-06, "loss": 0.82572055, "num_input_tokens_seen": 120947620, "step": 5633, "time_per_iteration": 2.6492738723754883 }, { "auxiliary_loss_clip": 0.01099799, "auxiliary_loss_mlp": 0.01044702, "balance_loss_clip": 1.04435778, "balance_loss_mlp": 1.02854943, "epoch": 0.33873440553133927, "flos": 23623116773760.0, "grad_norm": 2.1401050060877997, "language_loss": 0.59013391, "learning_rate": 3.0807007611964855e-06, "loss": 0.61157894, "num_input_tokens_seen": 120965205, "step": 5634, "time_per_iteration": 2.7261369228363037 }, { "auxiliary_loss_clip": 0.01106157, "auxiliary_loss_mlp": 0.01040516, "balance_loss_clip": 1.04877985, "balance_loss_mlp": 1.02482784, "epoch": 0.33879452878400723, "flos": 17088882992640.0, "grad_norm": 1.8243057386875807, "language_loss": 0.92440355, "learning_rate": 3.080373032026589e-06, "loss": 0.94587028, "num_input_tokens_seen": 120983560, "step": 5635, "time_per_iteration": 2.627788782119751 }, { "auxiliary_loss_clip": 0.01091476, "auxiliary_loss_mlp": 0.01039192, "balance_loss_clip": 1.05005646, "balance_loss_mlp": 1.02288401, "epoch": 0.3388546520366752, "flos": 15742053607680.0, "grad_norm": 2.00681285666687, "language_loss": 0.75539577, "learning_rate": 3.0800452618888386e-06, "loss": 0.7767024, "num_input_tokens_seen": 121001400, "step": 5636, "time_per_iteration": 2.706772565841675 }, { "auxiliary_loss_clip": 0.0112617, "auxiliary_loss_mlp": 0.01044921, "balance_loss_clip": 1.05089188, "balance_loss_mlp": 1.02866137, "epoch": 0.33891477528934316, "flos": 22418744728320.0, "grad_norm": 1.7127540900641318, "language_loss": 0.83448696, "learning_rate": 3.0797174507956637e-06, "loss": 0.85619783, "num_input_tokens_seen": 121021760, "step": 5637, "time_per_iteration": 2.6864166259765625 }, { "auxiliary_loss_clip": 0.0109052, "auxiliary_loss_mlp": 0.01051499, "balance_loss_clip": 1.04899251, "balance_loss_mlp": 1.03193665, "epoch": 0.3389748985420111, "flos": 17274828723840.0, "grad_norm": 1.650296659926583, "language_loss": 0.70123053, "learning_rate": 3.079389598759495e-06, "loss": 0.72265071, "num_input_tokens_seen": 121041070, "step": 5638, "time_per_iteration": 2.7513418197631836 }, { "auxiliary_loss_clip": 0.01107421, "auxiliary_loss_mlp": 0.01049541, "balance_loss_clip": 1.0486834, "balance_loss_mlp": 1.0325892, "epoch": 0.3390350217946791, "flos": 27744979190400.0, "grad_norm": 3.471125425253904, "language_loss": 0.80819786, "learning_rate": 3.079061705792765e-06, "loss": 0.82976747, "num_input_tokens_seen": 121060890, "step": 5639, "time_per_iteration": 2.8025810718536377 }, { "auxiliary_loss_clip": 0.01143398, "auxiliary_loss_mlp": 0.01048836, "balance_loss_clip": 1.0533762, "balance_loss_mlp": 1.03158689, "epoch": 0.33909514504734706, "flos": 20339804338560.0, "grad_norm": 8.162571098362656, "language_loss": 0.67619336, "learning_rate": 3.078733771907907e-06, "loss": 0.69811565, "num_input_tokens_seen": 121079135, "step": 5640, "time_per_iteration": 2.662127733230591 }, { "auxiliary_loss_clip": 0.01114186, "auxiliary_loss_mlp": 0.01038526, "balance_loss_clip": 1.04930854, "balance_loss_mlp": 1.02196789, "epoch": 0.339155268300015, "flos": 14830030356480.0, "grad_norm": 1.6687164879604648, "language_loss": 0.69589841, "learning_rate": 3.0784057971173554e-06, "loss": 0.71742553, "num_input_tokens_seen": 121097685, "step": 5641, "time_per_iteration": 2.6596109867095947 }, { "auxiliary_loss_clip": 0.01142481, "auxiliary_loss_mlp": 0.0104296, "balance_loss_clip": 1.05451512, "balance_loss_mlp": 1.02698565, "epoch": 0.339215391552683, "flos": 26067951054720.0, "grad_norm": 2.4357287647671266, "language_loss": 0.87591994, "learning_rate": 3.0780777814335483e-06, "loss": 0.89777428, "num_input_tokens_seen": 121115640, "step": 5642, "time_per_iteration": 2.6347198486328125 }, { "auxiliary_loss_clip": 0.01117312, "auxiliary_loss_mlp": 0.01034931, "balance_loss_clip": 1.04759669, "balance_loss_mlp": 1.02112639, "epoch": 0.33927551480535095, "flos": 14574705505920.0, "grad_norm": 1.860184080586481, "language_loss": 0.83900917, "learning_rate": 3.077749724868924e-06, "loss": 0.86053157, "num_input_tokens_seen": 121132485, "step": 5643, "time_per_iteration": 2.678086042404175 }, { "auxiliary_loss_clip": 0.01107188, "auxiliary_loss_mlp": 0.01049417, "balance_loss_clip": 1.04616475, "balance_loss_mlp": 1.03295422, "epoch": 0.3393356380580189, "flos": 23805578885760.0, "grad_norm": 4.293096130940915, "language_loss": 0.76897138, "learning_rate": 3.077421627435922e-06, "loss": 0.79053748, "num_input_tokens_seen": 121152935, "step": 5644, "time_per_iteration": 2.6681976318359375 }, { "auxiliary_loss_clip": 0.01123, "auxiliary_loss_mlp": 0.01046638, "balance_loss_clip": 1.05055666, "balance_loss_mlp": 1.02978194, "epoch": 0.3393957613106869, "flos": 17347871030400.0, "grad_norm": 8.889141309374795, "language_loss": 0.62855232, "learning_rate": 3.0770934891469832e-06, "loss": 0.65024871, "num_input_tokens_seen": 121169835, "step": 5645, "time_per_iteration": 2.5976576805114746 }, { "auxiliary_loss_clip": 0.01123901, "auxiliary_loss_mlp": 0.01042398, "balance_loss_clip": 1.04963613, "balance_loss_mlp": 1.0272944, "epoch": 0.3394558845633549, "flos": 28433960939520.0, "grad_norm": 1.8158202042065192, "language_loss": 0.76223624, "learning_rate": 3.076765310014552e-06, "loss": 0.78389925, "num_input_tokens_seen": 121190290, "step": 5646, "time_per_iteration": 2.674058437347412 }, { "auxiliary_loss_clip": 0.01128511, "auxiliary_loss_mlp": 0.01049927, "balance_loss_clip": 1.05314088, "balance_loss_mlp": 1.03245091, "epoch": 0.33951600781602287, "flos": 22086929865600.0, "grad_norm": 2.6597837481337256, "language_loss": 0.78888249, "learning_rate": 3.0764370900510727e-06, "loss": 0.81066692, "num_input_tokens_seen": 121209060, "step": 5647, "time_per_iteration": 2.636462688446045 }, { "auxiliary_loss_clip": 0.01113432, "auxiliary_loss_mlp": 0.0077397, "balance_loss_clip": 1.05254745, "balance_loss_mlp": 1.00053275, "epoch": 0.33957613106869083, "flos": 23878262056320.0, "grad_norm": 2.0563114900155037, "language_loss": 0.77694631, "learning_rate": 3.0761088292689904e-06, "loss": 0.7958203, "num_input_tokens_seen": 121227480, "step": 5648, "time_per_iteration": 2.704535484313965 }, { "auxiliary_loss_clip": 0.00999132, "auxiliary_loss_mlp": 0.01023587, "balance_loss_clip": 1.03748918, "balance_loss_mlp": 1.02168012, "epoch": 0.3396362543213588, "flos": 71242642414080.0, "grad_norm": 0.7822172669689142, "language_loss": 0.56281364, "learning_rate": 3.075780527680754e-06, "loss": 0.58304083, "num_input_tokens_seen": 121291305, "step": 5649, "time_per_iteration": 3.6428561210632324 }, { "auxiliary_loss_clip": 0.01109513, "auxiliary_loss_mlp": 0.00776659, "balance_loss_clip": 1.04886901, "balance_loss_mlp": 1.00053644, "epoch": 0.33969637757402676, "flos": 25921615046400.0, "grad_norm": 1.4990429944851429, "language_loss": 0.85522908, "learning_rate": 3.0754521852988117e-06, "loss": 0.87409085, "num_input_tokens_seen": 121312740, "step": 5650, "time_per_iteration": 4.6250996589660645 }, { "auxiliary_loss_clip": 0.01125063, "auxiliary_loss_mlp": 0.01029114, "balance_loss_clip": 1.04845572, "balance_loss_mlp": 1.01392674, "epoch": 0.33975650082669473, "flos": 35261728663680.0, "grad_norm": 1.7009103293103713, "language_loss": 0.70462626, "learning_rate": 3.0751238021356152e-06, "loss": 0.7261681, "num_input_tokens_seen": 121334220, "step": 5651, "time_per_iteration": 3.0873425006866455 }, { "auxiliary_loss_clip": 0.01088353, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.04718101, "balance_loss_mlp": 1.02539587, "epoch": 0.3398166240793627, "flos": 16647001879680.0, "grad_norm": 2.657059560006321, "language_loss": 0.80932343, "learning_rate": 3.074795378203616e-06, "loss": 0.83062148, "num_input_tokens_seen": 121351870, "step": 5652, "time_per_iteration": 2.957105875015259 }, { "auxiliary_loss_clip": 0.01143187, "auxiliary_loss_mlp": 0.0104477, "balance_loss_clip": 1.05543184, "balance_loss_mlp": 1.0275445, "epoch": 0.33987674733203066, "flos": 24062196625920.0, "grad_norm": 2.181969038816262, "language_loss": 0.76847494, "learning_rate": 3.0744669135152685e-06, "loss": 0.79035449, "num_input_tokens_seen": 121373400, "step": 5653, "time_per_iteration": 4.277743816375732 }, { "auxiliary_loss_clip": 0.01117346, "auxiliary_loss_mlp": 0.01041107, "balance_loss_clip": 1.04708898, "balance_loss_mlp": 1.02475142, "epoch": 0.3399368705846986, "flos": 13250678279040.0, "grad_norm": 2.9108557214850217, "language_loss": 0.85412633, "learning_rate": 3.0741384080830278e-06, "loss": 0.8757109, "num_input_tokens_seen": 121385225, "step": 5654, "time_per_iteration": 4.243285179138184 }, { "auxiliary_loss_clip": 0.01118111, "auxiliary_loss_mlp": 0.01041226, "balance_loss_clip": 1.04521537, "balance_loss_mlp": 1.02490664, "epoch": 0.3399969938373666, "flos": 27012832272000.0, "grad_norm": 5.5024852924346765, "language_loss": 0.64919531, "learning_rate": 3.073809861919351e-06, "loss": 0.67078876, "num_input_tokens_seen": 121404735, "step": 5655, "time_per_iteration": 2.793121576309204 }, { "auxiliary_loss_clip": 0.01129599, "auxiliary_loss_mlp": 0.01043607, "balance_loss_clip": 1.05404055, "balance_loss_mlp": 1.02828872, "epoch": 0.34005711709003456, "flos": 28550096588160.0, "grad_norm": 1.7231624830718477, "language_loss": 0.7624622, "learning_rate": 3.073481275036697e-06, "loss": 0.78419423, "num_input_tokens_seen": 121426780, "step": 5656, "time_per_iteration": 2.739227056503296 }, { "auxiliary_loss_clip": 0.01102847, "auxiliary_loss_mlp": 0.01040319, "balance_loss_clip": 1.0458467, "balance_loss_mlp": 1.02364159, "epoch": 0.3401172403427025, "flos": 21617003208960.0, "grad_norm": 8.964185236965056, "language_loss": 0.82842731, "learning_rate": 3.073152647447525e-06, "loss": 0.849859, "num_input_tokens_seen": 121447245, "step": 5657, "time_per_iteration": 5.179774761199951 }, { "auxiliary_loss_clip": 0.01113742, "auxiliary_loss_mlp": 0.01048481, "balance_loss_clip": 1.05169284, "balance_loss_mlp": 1.03313899, "epoch": 0.3401773635953705, "flos": 25885776251520.0, "grad_norm": 1.8385093437954252, "language_loss": 0.85050905, "learning_rate": 3.0728239791642976e-06, "loss": 0.87213123, "num_input_tokens_seen": 121468165, "step": 5658, "time_per_iteration": 2.776137351989746 }, { "auxiliary_loss_clip": 0.01053106, "auxiliary_loss_mlp": 0.01016184, "balance_loss_clip": 1.03449082, "balance_loss_mlp": 1.01424086, "epoch": 0.3402374868480385, "flos": 65507995336320.0, "grad_norm": 0.825209949556337, "language_loss": 0.59988189, "learning_rate": 3.072495270199477e-06, "loss": 0.62057471, "num_input_tokens_seen": 121523795, "step": 5659, "time_per_iteration": 3.272684335708618 }, { "auxiliary_loss_clip": 0.01137862, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.05531621, "balance_loss_mlp": 1.02102888, "epoch": 0.34029761010070647, "flos": 24060580513920.0, "grad_norm": 2.521681543348545, "language_loss": 0.67763948, "learning_rate": 3.0721665205655284e-06, "loss": 0.69937897, "num_input_tokens_seen": 121542950, "step": 5660, "time_per_iteration": 2.699267864227295 }, { "auxiliary_loss_clip": 0.01142235, "auxiliary_loss_mlp": 0.010443, "balance_loss_clip": 1.05695057, "balance_loss_mlp": 1.02787328, "epoch": 0.34035773335337444, "flos": 27599720590080.0, "grad_norm": 1.9299535220965447, "language_loss": 0.67668259, "learning_rate": 3.071837730274918e-06, "loss": 0.69854796, "num_input_tokens_seen": 121562765, "step": 5661, "time_per_iteration": 2.647101402282715 }, { "auxiliary_loss_clip": 0.01119112, "auxiliary_loss_mlp": 0.01041902, "balance_loss_clip": 1.05479288, "balance_loss_mlp": 1.02634561, "epoch": 0.3404178566060424, "flos": 20812783651200.0, "grad_norm": 2.0521689983251954, "language_loss": 0.78806192, "learning_rate": 3.071508899340113e-06, "loss": 0.80967206, "num_input_tokens_seen": 121581610, "step": 5662, "time_per_iteration": 2.847168207168579 }, { "auxiliary_loss_clip": 0.01103563, "auxiliary_loss_mlp": 0.01041962, "balance_loss_clip": 1.05163002, "balance_loss_mlp": 1.02498698, "epoch": 0.34047797985871037, "flos": 26833566470400.0, "grad_norm": 2.226848836482441, "language_loss": 0.73531127, "learning_rate": 3.0711800277735833e-06, "loss": 0.75676656, "num_input_tokens_seen": 121601885, "step": 5663, "time_per_iteration": 2.8581340312957764 }, { "auxiliary_loss_clip": 0.01090462, "auxiliary_loss_mlp": 0.01035271, "balance_loss_clip": 1.04631042, "balance_loss_mlp": 1.02079868, "epoch": 0.34053810311137833, "flos": 19682639061120.0, "grad_norm": 1.7108226041633658, "language_loss": 0.86297357, "learning_rate": 3.0708511155877997e-06, "loss": 0.88423085, "num_input_tokens_seen": 121621335, "step": 5664, "time_per_iteration": 2.778038501739502 }, { "auxiliary_loss_clip": 0.01139377, "auxiliary_loss_mlp": 0.0103938, "balance_loss_clip": 1.05399597, "balance_loss_mlp": 1.0245564, "epoch": 0.3405982263640463, "flos": 21725740656000.0, "grad_norm": 2.2398696420560675, "language_loss": 0.68712831, "learning_rate": 3.070522162795235e-06, "loss": 0.70891583, "num_input_tokens_seen": 121641310, "step": 5665, "time_per_iteration": 2.688643217086792 }, { "auxiliary_loss_clip": 0.01138662, "auxiliary_loss_mlp": 0.01039766, "balance_loss_clip": 1.05278993, "balance_loss_mlp": 1.0229218, "epoch": 0.34065834961671426, "flos": 18041629288320.0, "grad_norm": 2.716291820837314, "language_loss": 0.73084486, "learning_rate": 3.0701931694083626e-06, "loss": 0.7526291, "num_input_tokens_seen": 121659625, "step": 5666, "time_per_iteration": 2.7325544357299805 }, { "auxiliary_loss_clip": 0.01128915, "auxiliary_loss_mlp": 0.01039671, "balance_loss_clip": 1.05135012, "balance_loss_mlp": 1.0244832, "epoch": 0.3407184728693822, "flos": 21397337585280.0, "grad_norm": 2.363121461769924, "language_loss": 0.72947341, "learning_rate": 3.0698641354396576e-06, "loss": 0.75115931, "num_input_tokens_seen": 121679205, "step": 5667, "time_per_iteration": 2.7143874168395996 }, { "auxiliary_loss_clip": 0.01042137, "auxiliary_loss_mlp": 0.01008076, "balance_loss_clip": 1.02401757, "balance_loss_mlp": 1.00638342, "epoch": 0.3407785961220502, "flos": 68688101018880.0, "grad_norm": 0.8313790259289849, "language_loss": 0.63259363, "learning_rate": 3.069535060901597e-06, "loss": 0.65309572, "num_input_tokens_seen": 121751085, "step": 5668, "time_per_iteration": 3.3907217979431152 }, { "auxiliary_loss_clip": 0.01036989, "auxiliary_loss_mlp": 0.01045108, "balance_loss_clip": 1.03961444, "balance_loss_mlp": 1.02808475, "epoch": 0.34083871937471816, "flos": 14064379027200.0, "grad_norm": 2.2447075161594365, "language_loss": 0.71795446, "learning_rate": 3.0692059458066596e-06, "loss": 0.73877549, "num_input_tokens_seen": 121768565, "step": 5669, "time_per_iteration": 2.941349983215332 }, { "auxiliary_loss_clip": 0.0110323, "auxiliary_loss_mlp": 0.00773367, "balance_loss_clip": 1.04966998, "balance_loss_mlp": 1.00054646, "epoch": 0.3408988426273861, "flos": 17085435287040.0, "grad_norm": 1.973306725053756, "language_loss": 0.80678529, "learning_rate": 3.0688767901673265e-06, "loss": 0.82555127, "num_input_tokens_seen": 121784925, "step": 5670, "time_per_iteration": 2.8877930641174316 }, { "auxiliary_loss_clip": 0.01088488, "auxiliary_loss_mlp": 0.01037182, "balance_loss_clip": 1.04484558, "balance_loss_mlp": 1.02111244, "epoch": 0.3409589658800541, "flos": 24024562151040.0, "grad_norm": 1.926244069219147, "language_loss": 0.77521646, "learning_rate": 3.068547593996078e-06, "loss": 0.79647315, "num_input_tokens_seen": 121804425, "step": 5671, "time_per_iteration": 2.886425256729126 }, { "auxiliary_loss_clip": 0.01138739, "auxiliary_loss_mlp": 0.0077388, "balance_loss_clip": 1.05301285, "balance_loss_mlp": 1.00052333, "epoch": 0.34101908913272205, "flos": 21142012734720.0, "grad_norm": 3.7152219569219427, "language_loss": 0.74220848, "learning_rate": 3.0682183573053974e-06, "loss": 0.76133466, "num_input_tokens_seen": 121825145, "step": 5672, "time_per_iteration": 2.751692056655884 }, { "auxiliary_loss_clip": 0.01121109, "auxiliary_loss_mlp": 0.01047405, "balance_loss_clip": 1.04886246, "balance_loss_mlp": 1.03089476, "epoch": 0.3410792123853901, "flos": 15702012921600.0, "grad_norm": 1.8011032028958165, "language_loss": 0.73721337, "learning_rate": 3.06788908010777e-06, "loss": 0.7588985, "num_input_tokens_seen": 121842185, "step": 5673, "time_per_iteration": 2.6628050804138184 }, { "auxiliary_loss_clip": 0.01126244, "auxiliary_loss_mlp": 0.01038975, "balance_loss_clip": 1.05143654, "balance_loss_mlp": 1.02362132, "epoch": 0.34113933563805804, "flos": 23036012974080.0, "grad_norm": 1.7591090628800392, "language_loss": 0.79972708, "learning_rate": 3.067559762415682e-06, "loss": 0.8213793, "num_input_tokens_seen": 121862260, "step": 5674, "time_per_iteration": 2.6803476810455322 }, { "auxiliary_loss_clip": 0.01054856, "auxiliary_loss_mlp": 0.01001466, "balance_loss_clip": 1.0258925, "balance_loss_mlp": 0.9994635, "epoch": 0.341199458890726, "flos": 69614235336960.0, "grad_norm": 0.7875282266281167, "language_loss": 0.56080592, "learning_rate": 3.0672304042416198e-06, "loss": 0.5813691, "num_input_tokens_seen": 121923560, "step": 5675, "time_per_iteration": 3.3068313598632812 }, { "auxiliary_loss_clip": 0.01115956, "auxiliary_loss_mlp": 0.00773448, "balance_loss_clip": 1.052145, "balance_loss_mlp": 1.0006851, "epoch": 0.34125958214339397, "flos": 22346348866560.0, "grad_norm": 1.6444328441844458, "language_loss": 0.78795338, "learning_rate": 3.0669010055980734e-06, "loss": 0.80684733, "num_input_tokens_seen": 121943515, "step": 5676, "time_per_iteration": 2.7983739376068115 }, { "auxiliary_loss_clip": 0.01120251, "auxiliary_loss_mlp": 0.01036846, "balance_loss_clip": 1.04593658, "balance_loss_mlp": 1.02024043, "epoch": 0.34131970539606193, "flos": 21871933009920.0, "grad_norm": 1.8897537275348075, "language_loss": 0.85468972, "learning_rate": 3.0665715664975357e-06, "loss": 0.8762607, "num_input_tokens_seen": 121962540, "step": 5677, "time_per_iteration": 2.698751449584961 }, { "auxiliary_loss_clip": 0.01109896, "auxiliary_loss_mlp": 0.01042182, "balance_loss_clip": 1.04772925, "balance_loss_mlp": 1.02586842, "epoch": 0.3413798286487299, "flos": 24935723475840.0, "grad_norm": 1.7514589696636707, "language_loss": 0.79352021, "learning_rate": 3.0662420869524966e-06, "loss": 0.81504107, "num_input_tokens_seen": 121979830, "step": 5678, "time_per_iteration": 2.731834650039673 }, { "auxiliary_loss_clip": 0.01123477, "auxiliary_loss_mlp": 0.01033453, "balance_loss_clip": 1.04799783, "balance_loss_mlp": 1.01833677, "epoch": 0.34143995190139786, "flos": 25374372364800.0, "grad_norm": 1.8765190883227818, "language_loss": 0.74821675, "learning_rate": 3.0659125669754506e-06, "loss": 0.76978606, "num_input_tokens_seen": 121999055, "step": 5679, "time_per_iteration": 2.7362489700317383 }, { "auxiliary_loss_clip": 0.01044772, "auxiliary_loss_mlp": 0.01004164, "balance_loss_clip": 1.02617037, "balance_loss_mlp": 1.00210214, "epoch": 0.34150007515406583, "flos": 67782578129280.0, "grad_norm": 0.716476818724812, "language_loss": 0.59445524, "learning_rate": 3.0655830065788923e-06, "loss": 0.61494464, "num_input_tokens_seen": 122067015, "step": 5680, "time_per_iteration": 3.241750955581665 }, { "auxiliary_loss_clip": 0.01108333, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.04563892, "balance_loss_mlp": 1.01804543, "epoch": 0.3415601984067338, "flos": 20302421258880.0, "grad_norm": 1.760771174406363, "language_loss": 0.72054088, "learning_rate": 3.0652534057753206e-06, "loss": 0.74195278, "num_input_tokens_seen": 122085295, "step": 5681, "time_per_iteration": 2.7306556701660156 }, { "auxiliary_loss_clip": 0.01109003, "auxiliary_loss_mlp": 0.0104301, "balance_loss_clip": 1.0462265, "balance_loss_mlp": 1.02786994, "epoch": 0.34162032165940176, "flos": 26031178506240.0, "grad_norm": 2.2327180896030443, "language_loss": 0.71463466, "learning_rate": 3.064923764577233e-06, "loss": 0.73615474, "num_input_tokens_seen": 122104020, "step": 5682, "time_per_iteration": 2.825296640396118 }, { "auxiliary_loss_clip": 0.01132395, "auxiliary_loss_mlp": 0.0104079, "balance_loss_clip": 1.04721618, "balance_loss_mlp": 1.02507806, "epoch": 0.3416804449120697, "flos": 28803338449920.0, "grad_norm": 1.5426603390069147, "language_loss": 0.84101224, "learning_rate": 3.0645940829971295e-06, "loss": 0.86274409, "num_input_tokens_seen": 122125080, "step": 5683, "time_per_iteration": 2.6654412746429443 }, { "auxiliary_loss_clip": 0.01112942, "auxiliary_loss_mlp": 0.01047099, "balance_loss_clip": 1.04768562, "balance_loss_mlp": 1.03113699, "epoch": 0.3417405681647377, "flos": 22601601889920.0, "grad_norm": 4.046428716645244, "language_loss": 0.70964772, "learning_rate": 3.0642643610475116e-06, "loss": 0.73124808, "num_input_tokens_seen": 122146350, "step": 5684, "time_per_iteration": 2.724592924118042 }, { "auxiliary_loss_clip": 0.01132202, "auxiliary_loss_mlp": 0.01038054, "balance_loss_clip": 1.04905093, "balance_loss_mlp": 1.02367699, "epoch": 0.34180069141740566, "flos": 24716237420160.0, "grad_norm": 1.9204482618269598, "language_loss": 0.74832582, "learning_rate": 3.0639345987408823e-06, "loss": 0.77002841, "num_input_tokens_seen": 122168085, "step": 5685, "time_per_iteration": 2.7046890258789062 }, { "auxiliary_loss_clip": 0.01114777, "auxiliary_loss_mlp": 0.0104831, "balance_loss_clip": 1.04522872, "balance_loss_mlp": 1.03261042, "epoch": 0.3418608146700737, "flos": 30518755246080.0, "grad_norm": 1.9200820074556442, "language_loss": 0.70611888, "learning_rate": 3.0636047960897468e-06, "loss": 0.72774971, "num_input_tokens_seen": 122191040, "step": 5686, "time_per_iteration": 2.7390410900115967 }, { "auxiliary_loss_clip": 0.01123208, "auxiliary_loss_mlp": 0.01044107, "balance_loss_clip": 1.04809284, "balance_loss_mlp": 1.02819252, "epoch": 0.34192093792274164, "flos": 15122343237120.0, "grad_norm": 2.0197354521106563, "language_loss": 0.77240539, "learning_rate": 3.06327495310661e-06, "loss": 0.79407853, "num_input_tokens_seen": 122209225, "step": 5687, "time_per_iteration": 2.6381263732910156 }, { "auxiliary_loss_clip": 0.01106353, "auxiliary_loss_mlp": 0.01040255, "balance_loss_clip": 1.04849195, "balance_loss_mlp": 1.02412593, "epoch": 0.3419810611754096, "flos": 13187799521280.0, "grad_norm": 3.7332163528162385, "language_loss": 0.8676976, "learning_rate": 3.062945069803981e-06, "loss": 0.88916373, "num_input_tokens_seen": 122226160, "step": 5688, "time_per_iteration": 2.647320508956909 }, { "auxiliary_loss_clip": 0.01119843, "auxiliary_loss_mlp": 0.01042145, "balance_loss_clip": 1.04928863, "balance_loss_mlp": 1.0255394, "epoch": 0.34204118442807757, "flos": 19536267139200.0, "grad_norm": 1.870477619822585, "language_loss": 0.79564822, "learning_rate": 3.0626151461943684e-06, "loss": 0.81726807, "num_input_tokens_seen": 122243115, "step": 5689, "time_per_iteration": 4.1660990715026855 }, { "auxiliary_loss_clip": 0.0112576, "auxiliary_loss_mlp": 0.01042306, "balance_loss_clip": 1.04875994, "balance_loss_mlp": 1.02580786, "epoch": 0.34210130768074554, "flos": 15194846839680.0, "grad_norm": 1.7530560995380315, "language_loss": 0.73215616, "learning_rate": 3.0622851822902834e-06, "loss": 0.75383675, "num_input_tokens_seen": 122261105, "step": 5690, "time_per_iteration": 2.699846029281616 }, { "auxiliary_loss_clip": 0.01115188, "auxiliary_loss_mlp": 0.01047594, "balance_loss_clip": 1.04381919, "balance_loss_mlp": 1.03121471, "epoch": 0.3421614309334135, "flos": 24936226266240.0, "grad_norm": 2.1339055209058184, "language_loss": 0.76036334, "learning_rate": 3.061955178104237e-06, "loss": 0.78199112, "num_input_tokens_seen": 122279995, "step": 5691, "time_per_iteration": 2.707598924636841 }, { "auxiliary_loss_clip": 0.01119412, "auxiliary_loss_mlp": 0.01042889, "balance_loss_clip": 1.04769242, "balance_loss_mlp": 1.02878046, "epoch": 0.34222155418608147, "flos": 21908633731200.0, "grad_norm": 1.9419180569645556, "language_loss": 0.68321705, "learning_rate": 3.0616251336487447e-06, "loss": 0.70484006, "num_input_tokens_seen": 122299070, "step": 5692, "time_per_iteration": 2.6876816749572754 }, { "auxiliary_loss_clip": 0.01123804, "auxiliary_loss_mlp": 0.01042902, "balance_loss_clip": 1.0481621, "balance_loss_mlp": 1.02660608, "epoch": 0.34228167743874943, "flos": 18114061063680.0, "grad_norm": 2.8342834288415504, "language_loss": 0.72458065, "learning_rate": 3.06129504893632e-06, "loss": 0.74624765, "num_input_tokens_seen": 122316800, "step": 5693, "time_per_iteration": 5.672837018966675 }, { "auxiliary_loss_clip": 0.01090312, "auxiliary_loss_mlp": 0.01043466, "balance_loss_clip": 1.0433774, "balance_loss_mlp": 1.02832651, "epoch": 0.3423418006914174, "flos": 21288600138240.0, "grad_norm": 1.9009541760697364, "language_loss": 0.75556326, "learning_rate": 3.0609649239794813e-06, "loss": 0.77690107, "num_input_tokens_seen": 122335275, "step": 5694, "time_per_iteration": 2.713236093521118 }, { "auxiliary_loss_clip": 0.01093804, "auxiliary_loss_mlp": 0.01036832, "balance_loss_clip": 1.04769742, "balance_loss_mlp": 1.02205038, "epoch": 0.34240192394408536, "flos": 19823480288640.0, "grad_norm": 2.1810058063417608, "language_loss": 0.79590774, "learning_rate": 3.060634758790747e-06, "loss": 0.81721413, "num_input_tokens_seen": 122353215, "step": 5695, "time_per_iteration": 2.7206506729125977 }, { "auxiliary_loss_clip": 0.01077977, "auxiliary_loss_mlp": 0.01043311, "balance_loss_clip": 1.04183137, "balance_loss_mlp": 1.02764642, "epoch": 0.3424620471967533, "flos": 24535535074560.0, "grad_norm": 1.8643380844369803, "language_loss": 0.73428202, "learning_rate": 3.060304553382635e-06, "loss": 0.75549489, "num_input_tokens_seen": 122372495, "step": 5696, "time_per_iteration": 4.777001857757568 }, { "auxiliary_loss_clip": 0.01088152, "auxiliary_loss_mlp": 0.01052674, "balance_loss_clip": 1.0424118, "balance_loss_mlp": 1.03569841, "epoch": 0.3425221704494213, "flos": 25848895962240.0, "grad_norm": 5.815439398629578, "language_loss": 0.71460104, "learning_rate": 3.0599743077676685e-06, "loss": 0.73600936, "num_input_tokens_seen": 122394600, "step": 5697, "time_per_iteration": 2.7620668411254883 }, { "auxiliary_loss_clip": 0.01108783, "auxiliary_loss_mlp": 0.01032533, "balance_loss_clip": 1.04925871, "balance_loss_mlp": 1.01740503, "epoch": 0.34258229370208926, "flos": 21540513196800.0, "grad_norm": 2.6993537181180316, "language_loss": 0.82170486, "learning_rate": 3.05964402195837e-06, "loss": 0.84311801, "num_input_tokens_seen": 122414700, "step": 5698, "time_per_iteration": 2.6930580139160156 }, { "auxiliary_loss_clip": 0.01077965, "auxiliary_loss_mlp": 0.01049711, "balance_loss_clip": 1.0451839, "balance_loss_mlp": 1.03073311, "epoch": 0.3426424169547573, "flos": 23652778429440.0, "grad_norm": 2.492082875954734, "language_loss": 0.68941295, "learning_rate": 3.0593136959672645e-06, "loss": 0.71068972, "num_input_tokens_seen": 122432760, "step": 5699, "time_per_iteration": 2.8604705333709717 }, { "auxiliary_loss_clip": 0.01113381, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.05009818, "balance_loss_mlp": 1.02698755, "epoch": 0.34270254020742524, "flos": 24644883052800.0, "grad_norm": 2.4799642493365046, "language_loss": 0.72708368, "learning_rate": 3.058983329806877e-06, "loss": 0.74863935, "num_input_tokens_seen": 122449105, "step": 5700, "time_per_iteration": 2.721219301223755 }, { "auxiliary_loss_clip": 0.01107869, "auxiliary_loss_mlp": 0.01033632, "balance_loss_clip": 1.05173492, "balance_loss_mlp": 1.01942825, "epoch": 0.3427626634600932, "flos": 20996754134400.0, "grad_norm": 1.8907099352771195, "language_loss": 0.81771016, "learning_rate": 3.0586529234897354e-06, "loss": 0.83912516, "num_input_tokens_seen": 122468700, "step": 5701, "time_per_iteration": 2.668776273727417 }, { "auxiliary_loss_clip": 0.01122749, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.05318427, "balance_loss_mlp": 1.02137566, "epoch": 0.3428227867127612, "flos": 21433786911360.0, "grad_norm": 1.8540703451937275, "language_loss": 0.71611702, "learning_rate": 3.0583224770283694e-06, "loss": 0.73770893, "num_input_tokens_seen": 122488160, "step": 5702, "time_per_iteration": 2.7413434982299805 }, { "auxiliary_loss_clip": 0.01034072, "auxiliary_loss_mlp": 0.0102117, "balance_loss_clip": 1.02648544, "balance_loss_mlp": 1.01936996, "epoch": 0.34288290996542914, "flos": 55731782695680.0, "grad_norm": 0.8291151185510042, "language_loss": 0.57455015, "learning_rate": 3.057991990435309e-06, "loss": 0.59510255, "num_input_tokens_seen": 122542890, "step": 5703, "time_per_iteration": 3.123619318008423 }, { "auxiliary_loss_clip": 0.01125899, "auxiliary_loss_mlp": 0.01044546, "balance_loss_clip": 1.05167961, "balance_loss_mlp": 1.02754664, "epoch": 0.3429430332180971, "flos": 20156803522560.0, "grad_norm": 2.054859273280662, "language_loss": 0.75049305, "learning_rate": 3.057661463723086e-06, "loss": 0.77219748, "num_input_tokens_seen": 122561770, "step": 5704, "time_per_iteration": 2.786344051361084 }, { "auxiliary_loss_clip": 0.01103715, "auxiliary_loss_mlp": 0.01039493, "balance_loss_clip": 1.05234969, "balance_loss_mlp": 1.02506232, "epoch": 0.34300315647076507, "flos": 17965857548160.0, "grad_norm": 1.921400910299184, "language_loss": 0.72367042, "learning_rate": 3.0573308969042346e-06, "loss": 0.74510252, "num_input_tokens_seen": 122580580, "step": 5705, "time_per_iteration": 2.7464826107025146 }, { "auxiliary_loss_clip": 0.01099266, "auxiliary_loss_mlp": 0.01035276, "balance_loss_clip": 1.05201912, "balance_loss_mlp": 1.01980281, "epoch": 0.34306327972343303, "flos": 22086822124800.0, "grad_norm": 2.585473080189318, "language_loss": 0.80016834, "learning_rate": 3.057000289991289e-06, "loss": 0.82151377, "num_input_tokens_seen": 122599810, "step": 5706, "time_per_iteration": 2.83493971824646 }, { "auxiliary_loss_clip": 0.01126183, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.05822873, "balance_loss_mlp": 1.02111542, "epoch": 0.343123402976101, "flos": 18442679616000.0, "grad_norm": 2.833985332828215, "language_loss": 0.83001584, "learning_rate": 3.056669642996787e-06, "loss": 0.85165167, "num_input_tokens_seen": 122616035, "step": 5707, "time_per_iteration": 2.6888725757598877 }, { "auxiliary_loss_clip": 0.01130807, "auxiliary_loss_mlp": 0.01038349, "balance_loss_clip": 1.05664158, "balance_loss_mlp": 1.02264881, "epoch": 0.34318352622876896, "flos": 17163685065600.0, "grad_norm": 1.6733576562987098, "language_loss": 0.75313264, "learning_rate": 3.056338955933266e-06, "loss": 0.7748242, "num_input_tokens_seen": 122633785, "step": 5708, "time_per_iteration": 2.655061960220337 }, { "auxiliary_loss_clip": 0.01105586, "auxiliary_loss_mlp": 0.01039807, "balance_loss_clip": 1.05063939, "balance_loss_mlp": 1.02357078, "epoch": 0.34324364948143693, "flos": 26688164215680.0, "grad_norm": 1.6008558791331946, "language_loss": 0.81187862, "learning_rate": 3.0560082288132662e-06, "loss": 0.83333254, "num_input_tokens_seen": 122652100, "step": 5709, "time_per_iteration": 2.7354934215545654 }, { "auxiliary_loss_clip": 0.01119071, "auxiliary_loss_mlp": 0.01043385, "balance_loss_clip": 1.0550828, "balance_loss_mlp": 1.02581382, "epoch": 0.3433037727341049, "flos": 21251576194560.0, "grad_norm": 2.1605529243452297, "language_loss": 0.79441178, "learning_rate": 3.055677461649329e-06, "loss": 0.81603634, "num_input_tokens_seen": 122669720, "step": 5710, "time_per_iteration": 2.757321834564209 }, { "auxiliary_loss_clip": 0.01130524, "auxiliary_loss_mlp": 0.01039861, "balance_loss_clip": 1.05363941, "balance_loss_mlp": 1.02329111, "epoch": 0.34336389598677286, "flos": 20629423699200.0, "grad_norm": 1.8403881586839854, "language_loss": 0.70303786, "learning_rate": 3.055346654453996e-06, "loss": 0.7247417, "num_input_tokens_seen": 122688715, "step": 5711, "time_per_iteration": 2.6535775661468506 }, { "auxiliary_loss_clip": 0.01106817, "auxiliary_loss_mlp": 0.00774858, "balance_loss_clip": 1.05299044, "balance_loss_mlp": 1.00072622, "epoch": 0.3434240192394409, "flos": 14538579402240.0, "grad_norm": 1.8401630077009354, "language_loss": 0.67124939, "learning_rate": 3.055015807239812e-06, "loss": 0.69006616, "num_input_tokens_seen": 122706970, "step": 5712, "time_per_iteration": 2.7115519046783447 }, { "auxiliary_loss_clip": 0.01051163, "auxiliary_loss_mlp": 0.01005713, "balance_loss_clip": 1.0511148, "balance_loss_mlp": 1.00409162, "epoch": 0.34348414249210885, "flos": 58051538841600.0, "grad_norm": 0.846630151399307, "language_loss": 0.58072996, "learning_rate": 3.0546849200193226e-06, "loss": 0.60129869, "num_input_tokens_seen": 122758095, "step": 5713, "time_per_iteration": 3.3988189697265625 }, { "auxiliary_loss_clip": 0.01142007, "auxiliary_loss_mlp": 0.01043862, "balance_loss_clip": 1.05782688, "balance_loss_mlp": 1.02813852, "epoch": 0.3435442657447768, "flos": 20704441253760.0, "grad_norm": 1.6506449407169241, "language_loss": 0.8079257, "learning_rate": 3.054353992805076e-06, "loss": 0.82978439, "num_input_tokens_seen": 122777815, "step": 5714, "time_per_iteration": 2.682537078857422 }, { "auxiliary_loss_clip": 0.01142274, "auxiliary_loss_mlp": 0.01042249, "balance_loss_clip": 1.0581255, "balance_loss_mlp": 1.02628696, "epoch": 0.3436043889974448, "flos": 22930256355840.0, "grad_norm": 2.1462767477025055, "language_loss": 0.72059911, "learning_rate": 3.05402302560962e-06, "loss": 0.74244434, "num_input_tokens_seen": 122797555, "step": 5715, "time_per_iteration": 2.6535134315490723 }, { "auxiliary_loss_clip": 0.01070037, "auxiliary_loss_mlp": 0.01002865, "balance_loss_clip": 1.0577507, "balance_loss_mlp": 1.00051689, "epoch": 0.34366451225011274, "flos": 58403285752320.0, "grad_norm": 0.9103705044251069, "language_loss": 0.65885556, "learning_rate": 3.053692018445505e-06, "loss": 0.67958462, "num_input_tokens_seen": 122863955, "step": 5716, "time_per_iteration": 3.205113172531128 }, { "auxiliary_loss_clip": 0.01124236, "auxiliary_loss_mlp": 0.0104266, "balance_loss_clip": 1.05416417, "balance_loss_mlp": 1.02718663, "epoch": 0.3437246355027807, "flos": 15596292216960.0, "grad_norm": 2.101112668121384, "language_loss": 0.74272031, "learning_rate": 3.0533609713252838e-06, "loss": 0.76438928, "num_input_tokens_seen": 122883000, "step": 5717, "time_per_iteration": 2.60300350189209 }, { "auxiliary_loss_clip": 0.01084832, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.05195725, "balance_loss_mlp": 1.02437937, "epoch": 0.34378475875544867, "flos": 27672260106240.0, "grad_norm": 1.8405555467441777, "language_loss": 0.75446129, "learning_rate": 3.0530298842615077e-06, "loss": 0.7757023, "num_input_tokens_seen": 122903265, "step": 5718, "time_per_iteration": 2.787687301635742 }, { "auxiliary_loss_clip": 0.01097103, "auxiliary_loss_mlp": 0.01043125, "balance_loss_clip": 1.04837775, "balance_loss_mlp": 1.02739501, "epoch": 0.34384488200811664, "flos": 31431496769280.0, "grad_norm": 1.9369525419747404, "language_loss": 0.63647246, "learning_rate": 3.052698757266734e-06, "loss": 0.65787476, "num_input_tokens_seen": 122923860, "step": 5719, "time_per_iteration": 2.8138949871063232 }, { "auxiliary_loss_clip": 0.01098152, "auxiliary_loss_mlp": 0.01040429, "balance_loss_clip": 1.05234158, "balance_loss_mlp": 1.02310777, "epoch": 0.3439050052607846, "flos": 24899920594560.0, "grad_norm": 1.8182809721987367, "language_loss": 0.73785692, "learning_rate": 3.0523675903535183e-06, "loss": 0.75924277, "num_input_tokens_seen": 122945305, "step": 5720, "time_per_iteration": 2.761371612548828 }, { "auxiliary_loss_clip": 0.01127909, "auxiliary_loss_mlp": 0.01052147, "balance_loss_clip": 1.056463, "balance_loss_mlp": 1.03434944, "epoch": 0.34396512851345257, "flos": 18150079426560.0, "grad_norm": 2.2267988645125896, "language_loss": 0.74087942, "learning_rate": 3.0520363835344173e-06, "loss": 0.76267999, "num_input_tokens_seen": 122962535, "step": 5721, "time_per_iteration": 2.6139280796051025 }, { "auxiliary_loss_clip": 0.0111919, "auxiliary_loss_mlp": 0.0077563, "balance_loss_clip": 1.05647993, "balance_loss_mlp": 1.00063252, "epoch": 0.34402525176612053, "flos": 16034438315520.0, "grad_norm": 2.313932715754647, "language_loss": 0.80464351, "learning_rate": 3.051705136821992e-06, "loss": 0.82359171, "num_input_tokens_seen": 122979750, "step": 5722, "time_per_iteration": 2.6886982917785645 }, { "auxiliary_loss_clip": 0.01092207, "auxiliary_loss_mlp": 0.01038868, "balance_loss_clip": 1.05326557, "balance_loss_mlp": 1.02348995, "epoch": 0.3440853750187885, "flos": 21178641628800.0, "grad_norm": 2.5095280683984984, "language_loss": 0.81647789, "learning_rate": 3.051373850228801e-06, "loss": 0.83778864, "num_input_tokens_seen": 122998955, "step": 5723, "time_per_iteration": 2.7464921474456787 }, { "auxiliary_loss_clip": 0.01099736, "auxiliary_loss_mlp": 0.0105726, "balance_loss_clip": 1.0488528, "balance_loss_mlp": 1.04023743, "epoch": 0.34414549827145646, "flos": 12677868092160.0, "grad_norm": 1.9897062128640133, "language_loss": 0.81431544, "learning_rate": 3.0510425237674096e-06, "loss": 0.83588541, "num_input_tokens_seen": 123016165, "step": 5724, "time_per_iteration": 2.7447471618652344 }, { "auxiliary_loss_clip": 0.01112954, "auxiliary_loss_mlp": 0.01047765, "balance_loss_clip": 1.05231178, "balance_loss_mlp": 1.03056324, "epoch": 0.3442056215241244, "flos": 31284514316160.0, "grad_norm": 1.858960952495153, "language_loss": 0.68913317, "learning_rate": 3.05071115745038e-06, "loss": 0.71074033, "num_input_tokens_seen": 123036900, "step": 5725, "time_per_iteration": 2.798987627029419 }, { "auxiliary_loss_clip": 0.01132971, "auxiliary_loss_mlp": 0.0105182, "balance_loss_clip": 1.05775714, "balance_loss_mlp": 1.03379524, "epoch": 0.34426574477679245, "flos": 23367289132800.0, "grad_norm": 1.4701315954442116, "language_loss": 0.6946882, "learning_rate": 3.0503797512902773e-06, "loss": 0.71653616, "num_input_tokens_seen": 123057480, "step": 5726, "time_per_iteration": 2.663766622543335 }, { "auxiliary_loss_clip": 0.01111868, "auxiliary_loss_mlp": 0.01038496, "balance_loss_clip": 1.05667615, "balance_loss_mlp": 1.02374983, "epoch": 0.3443258680294604, "flos": 24535427333760.0, "grad_norm": 2.4860883718983873, "language_loss": 0.73317868, "learning_rate": 3.0500483052996703e-06, "loss": 0.7546823, "num_input_tokens_seen": 123076890, "step": 5727, "time_per_iteration": 2.8002336025238037 }, { "auxiliary_loss_clip": 0.01097058, "auxiliary_loss_mlp": 0.01052204, "balance_loss_clip": 1.05053401, "balance_loss_mlp": 1.03590822, "epoch": 0.3443859912821284, "flos": 20230133137920.0, "grad_norm": 2.2067060616784815, "language_loss": 0.88451493, "learning_rate": 3.0497168194911257e-06, "loss": 0.90600753, "num_input_tokens_seen": 123092530, "step": 5728, "time_per_iteration": 2.703842878341675 }, { "auxiliary_loss_clip": 0.01089582, "auxiliary_loss_mlp": 0.01048379, "balance_loss_clip": 1.04858351, "balance_loss_mlp": 1.03266144, "epoch": 0.34444611453479634, "flos": 24316515895680.0, "grad_norm": 2.2135571419735904, "language_loss": 0.70018214, "learning_rate": 3.0493852938772143e-06, "loss": 0.72156173, "num_input_tokens_seen": 123110560, "step": 5729, "time_per_iteration": 4.360877275466919 }, { "auxiliary_loss_clip": 0.01124088, "auxiliary_loss_mlp": 0.01037772, "balance_loss_clip": 1.0525502, "balance_loss_mlp": 1.02208424, "epoch": 0.3445062377874643, "flos": 16983413683200.0, "grad_norm": 1.9483871766944658, "language_loss": 0.7435137, "learning_rate": 3.0490537284705078e-06, "loss": 0.76513231, "num_input_tokens_seen": 123128655, "step": 5730, "time_per_iteration": 2.6021499633789062 }, { "auxiliary_loss_clip": 0.01099617, "auxiliary_loss_mlp": 0.0105823, "balance_loss_clip": 1.04880106, "balance_loss_mlp": 1.04053974, "epoch": 0.3445663610401323, "flos": 20302708567680.0, "grad_norm": 2.1142556114368314, "language_loss": 0.7952323, "learning_rate": 3.048722123283578e-06, "loss": 0.81681079, "num_input_tokens_seen": 123145130, "step": 5731, "time_per_iteration": 4.273399114608765 }, { "auxiliary_loss_clip": 0.01130567, "auxiliary_loss_mlp": 0.01043537, "balance_loss_clip": 1.05617356, "balance_loss_mlp": 1.02793896, "epoch": 0.34462648429280024, "flos": 15888102307200.0, "grad_norm": 2.0299111477971334, "language_loss": 0.78609502, "learning_rate": 3.0483904783290006e-06, "loss": 0.80783606, "num_input_tokens_seen": 123162265, "step": 5732, "time_per_iteration": 4.672218322753906 }, { "auxiliary_loss_clip": 0.01037769, "auxiliary_loss_mlp": 0.0101237, "balance_loss_clip": 1.03788018, "balance_loss_mlp": 1.0106411, "epoch": 0.3446866075454682, "flos": 59311035285120.0, "grad_norm": 0.7456337544046427, "language_loss": 0.53537595, "learning_rate": 3.0480587936193505e-06, "loss": 0.55587733, "num_input_tokens_seen": 123218620, "step": 5733, "time_per_iteration": 3.322802782058716 }, { "auxiliary_loss_clip": 0.01122514, "auxiliary_loss_mlp": 0.01042066, "balance_loss_clip": 1.05675018, "balance_loss_mlp": 1.02577019, "epoch": 0.34474673079813617, "flos": 22343799000960.0, "grad_norm": 1.936820728476944, "language_loss": 0.832178, "learning_rate": 3.047727069167207e-06, "loss": 0.85382378, "num_input_tokens_seen": 123237325, "step": 5734, "time_per_iteration": 2.7426953315734863 }, { "auxiliary_loss_clip": 0.01120142, "auxiliary_loss_mlp": 0.0103601, "balance_loss_clip": 1.05517805, "balance_loss_mlp": 1.01988125, "epoch": 0.34480685405080413, "flos": 27670141203840.0, "grad_norm": 2.7764640699074077, "language_loss": 0.92655241, "learning_rate": 3.0473953049851478e-06, "loss": 0.94811392, "num_input_tokens_seen": 123258650, "step": 5735, "time_per_iteration": 4.536838054656982 }, { "auxiliary_loss_clip": 0.0110302, "auxiliary_loss_mlp": 0.01041265, "balance_loss_clip": 1.05774188, "balance_loss_mlp": 1.02492189, "epoch": 0.3448669773034721, "flos": 22456020067200.0, "grad_norm": 1.7508294751665012, "language_loss": 0.76571405, "learning_rate": 3.0470635010857533e-06, "loss": 0.78715694, "num_input_tokens_seen": 123277155, "step": 5736, "time_per_iteration": 2.784958600997925 }, { "auxiliary_loss_clip": 0.01122912, "auxiliary_loss_mlp": 0.0104053, "balance_loss_clip": 1.05683184, "balance_loss_mlp": 1.02396011, "epoch": 0.34492710055614006, "flos": 24936190352640.0, "grad_norm": 1.7983696926456887, "language_loss": 0.78327668, "learning_rate": 3.0467316574816064e-06, "loss": 0.80491114, "num_input_tokens_seen": 123297640, "step": 5737, "time_per_iteration": 2.709786891937256 }, { "auxiliary_loss_clip": 0.01083721, "auxiliary_loss_mlp": 0.0104406, "balance_loss_clip": 1.04379368, "balance_loss_mlp": 1.02520096, "epoch": 0.34498722380880803, "flos": 20120821073280.0, "grad_norm": 2.0055780284948375, "language_loss": 0.71544027, "learning_rate": 3.0463997741852893e-06, "loss": 0.73671806, "num_input_tokens_seen": 123314370, "step": 5738, "time_per_iteration": 2.779651165008545 }, { "auxiliary_loss_clip": 0.0110112, "auxiliary_loss_mlp": 0.01042892, "balance_loss_clip": 1.04991913, "balance_loss_mlp": 1.02520132, "epoch": 0.34504734706147605, "flos": 28438126917120.0, "grad_norm": 2.7751951344870562, "language_loss": 0.82324719, "learning_rate": 3.046067851209389e-06, "loss": 0.84468728, "num_input_tokens_seen": 123336085, "step": 5739, "time_per_iteration": 2.7953522205352783 }, { "auxiliary_loss_clip": 0.01104482, "auxiliary_loss_mlp": 0.01037335, "balance_loss_clip": 1.05071819, "balance_loss_mlp": 1.02132511, "epoch": 0.345107470314144, "flos": 22674464628480.0, "grad_norm": 1.8186717226973075, "language_loss": 0.83071041, "learning_rate": 3.0457358885664898e-06, "loss": 0.85212862, "num_input_tokens_seen": 123354460, "step": 5740, "time_per_iteration": 2.7530486583709717 }, { "auxiliary_loss_clip": 0.01130478, "auxiliary_loss_mlp": 0.01035685, "balance_loss_clip": 1.05699897, "balance_loss_mlp": 1.01901984, "epoch": 0.345167593566812, "flos": 20630716588800.0, "grad_norm": 2.1971165557092656, "language_loss": 0.7704618, "learning_rate": 3.045403886269181e-06, "loss": 0.79212344, "num_input_tokens_seen": 123373420, "step": 5741, "time_per_iteration": 2.6488983631134033 }, { "auxiliary_loss_clip": 0.01116686, "auxiliary_loss_mlp": 0.01038328, "balance_loss_clip": 1.05202794, "balance_loss_mlp": 1.02271724, "epoch": 0.34522771681947995, "flos": 26214358890240.0, "grad_norm": 1.629760829576741, "language_loss": 0.76972193, "learning_rate": 3.045071844330053e-06, "loss": 0.7912721, "num_input_tokens_seen": 123394730, "step": 5742, "time_per_iteration": 2.7333807945251465 }, { "auxiliary_loss_clip": 0.01133631, "auxiliary_loss_mlp": 0.01040013, "balance_loss_clip": 1.05862427, "balance_loss_mlp": 1.02371693, "epoch": 0.3452878400721479, "flos": 19062354072960.0, "grad_norm": 2.2460068376984523, "language_loss": 0.76135588, "learning_rate": 3.0447397627616955e-06, "loss": 0.78309238, "num_input_tokens_seen": 123412895, "step": 5743, "time_per_iteration": 2.677682638168335 }, { "auxiliary_loss_clip": 0.01128893, "auxiliary_loss_mlp": 0.01037178, "balance_loss_clip": 1.05570602, "balance_loss_mlp": 1.02171636, "epoch": 0.3453479633248159, "flos": 27929739772800.0, "grad_norm": 2.0501405423310097, "language_loss": 0.70481914, "learning_rate": 3.0444076415767016e-06, "loss": 0.72647989, "num_input_tokens_seen": 123432320, "step": 5744, "time_per_iteration": 2.7430574893951416 }, { "auxiliary_loss_clip": 0.01140382, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.05727339, "balance_loss_mlp": 1.01959133, "epoch": 0.34540808657748384, "flos": 19606113135360.0, "grad_norm": 2.271690731291802, "language_loss": 0.79658759, "learning_rate": 3.044075480787665e-06, "loss": 0.81835419, "num_input_tokens_seen": 123450980, "step": 5745, "time_per_iteration": 2.6587865352630615 }, { "auxiliary_loss_clip": 0.01092128, "auxiliary_loss_mlp": 0.01041398, "balance_loss_clip": 1.0486573, "balance_loss_mlp": 1.02435148, "epoch": 0.3454682098301518, "flos": 20411661496320.0, "grad_norm": 1.8194779915280654, "language_loss": 0.89049339, "learning_rate": 3.043743280407182e-06, "loss": 0.91182864, "num_input_tokens_seen": 123469365, "step": 5746, "time_per_iteration": 2.7314908504486084 }, { "auxiliary_loss_clip": 0.01133638, "auxiliary_loss_mlp": 0.01038455, "balance_loss_clip": 1.05554819, "balance_loss_mlp": 1.02101421, "epoch": 0.34552833308281977, "flos": 21325121291520.0, "grad_norm": 2.5554958969654136, "language_loss": 0.64851058, "learning_rate": 3.043411040447849e-06, "loss": 0.67023152, "num_input_tokens_seen": 123489425, "step": 5747, "time_per_iteration": 2.6858277320861816 }, { "auxiliary_loss_clip": 0.01119459, "auxiliary_loss_mlp": 0.01035118, "balance_loss_clip": 1.05213308, "balance_loss_mlp": 1.01928735, "epoch": 0.34558845633548774, "flos": 36243633824640.0, "grad_norm": 1.5633023430662023, "language_loss": 0.72855747, "learning_rate": 3.043078760922264e-06, "loss": 0.75010324, "num_input_tokens_seen": 123509970, "step": 5748, "time_per_iteration": 2.805250406265259 }, { "auxiliary_loss_clip": 0.01084714, "auxiliary_loss_mlp": 0.01032651, "balance_loss_clip": 1.05246413, "balance_loss_mlp": 1.01832819, "epoch": 0.3456485795881557, "flos": 22450561200000.0, "grad_norm": 1.6861475272665256, "language_loss": 0.7584126, "learning_rate": 3.042746441843029e-06, "loss": 0.7795862, "num_input_tokens_seen": 123531055, "step": 5749, "time_per_iteration": 2.8886258602142334 }, { "auxiliary_loss_clip": 0.01061531, "auxiliary_loss_mlp": 0.01002064, "balance_loss_clip": 1.05058503, "balance_loss_mlp": 1.00045478, "epoch": 0.34570870284082367, "flos": 62004299005440.0, "grad_norm": 0.8852783380527953, "language_loss": 0.62715566, "learning_rate": 3.0424140832227437e-06, "loss": 0.64779162, "num_input_tokens_seen": 123584720, "step": 5750, "time_per_iteration": 3.1283066272735596 }, { "auxiliary_loss_clip": 0.01110881, "auxiliary_loss_mlp": 0.01037788, "balance_loss_clip": 1.05210388, "balance_loss_mlp": 1.02242184, "epoch": 0.34576882609349163, "flos": 22782196494720.0, "grad_norm": 2.239830827663745, "language_loss": 0.80332017, "learning_rate": 3.042081685074012e-06, "loss": 0.82480681, "num_input_tokens_seen": 123604465, "step": 5751, "time_per_iteration": 2.721344470977783 }, { "auxiliary_loss_clip": 0.01135561, "auxiliary_loss_mlp": 0.01045926, "balance_loss_clip": 1.0536952, "balance_loss_mlp": 1.03101254, "epoch": 0.34582894934615965, "flos": 12348818576640.0, "grad_norm": 2.3847713847020744, "language_loss": 0.84148252, "learning_rate": 3.041749247409439e-06, "loss": 0.86329746, "num_input_tokens_seen": 123622320, "step": 5752, "time_per_iteration": 2.578984260559082 }, { "auxiliary_loss_clip": 0.01047286, "auxiliary_loss_mlp": 0.00754976, "balance_loss_clip": 1.0380801, "balance_loss_mlp": 1.00148225, "epoch": 0.3458890725988276, "flos": 70167691071360.0, "grad_norm": 0.7284359747550926, "language_loss": 0.6310631, "learning_rate": 3.0414167702416296e-06, "loss": 0.64908576, "num_input_tokens_seen": 123678010, "step": 5753, "time_per_iteration": 3.0907819271087646 }, { "auxiliary_loss_clip": 0.01112695, "auxiliary_loss_mlp": 0.01035981, "balance_loss_clip": 1.05358505, "balance_loss_mlp": 1.01956582, "epoch": 0.3459491958514956, "flos": 17092582093440.0, "grad_norm": 1.9590865283999213, "language_loss": 0.71000856, "learning_rate": 3.0410842535831914e-06, "loss": 0.73149538, "num_input_tokens_seen": 123696830, "step": 5754, "time_per_iteration": 2.7031564712524414 }, { "auxiliary_loss_clip": 0.01127989, "auxiliary_loss_mlp": 0.01038041, "balance_loss_clip": 1.05300486, "balance_loss_mlp": 1.02251959, "epoch": 0.34600931910416355, "flos": 16650952375680.0, "grad_norm": 2.56305874029915, "language_loss": 0.73286581, "learning_rate": 3.0407516974467343e-06, "loss": 0.75452608, "num_input_tokens_seen": 123714360, "step": 5755, "time_per_iteration": 2.656804084777832 }, { "auxiliary_loss_clip": 0.01122508, "auxiliary_loss_mlp": 0.01033304, "balance_loss_clip": 1.0504849, "balance_loss_mlp": 1.01791406, "epoch": 0.3460694423568315, "flos": 38546190334080.0, "grad_norm": 1.7746130503339408, "language_loss": 0.7232182, "learning_rate": 3.040419101844869e-06, "loss": 0.74477637, "num_input_tokens_seen": 123739250, "step": 5756, "time_per_iteration": 2.8805603981018066 }, { "auxiliary_loss_clip": 0.01055943, "auxiliary_loss_mlp": 0.01012753, "balance_loss_clip": 1.03647125, "balance_loss_mlp": 1.01088166, "epoch": 0.3461295656094995, "flos": 72081479704320.0, "grad_norm": 0.7176054236110851, "language_loss": 0.62659568, "learning_rate": 3.040086466790207e-06, "loss": 0.64728266, "num_input_tokens_seen": 123802845, "step": 5757, "time_per_iteration": 3.21248197555542 }, { "auxiliary_loss_clip": 0.0103445, "auxiliary_loss_mlp": 0.00755471, "balance_loss_clip": 1.03495657, "balance_loss_mlp": 1.0016396, "epoch": 0.34618968886216744, "flos": 65460089571840.0, "grad_norm": 0.8171010225304897, "language_loss": 0.59206927, "learning_rate": 3.039753792295362e-06, "loss": 0.60996854, "num_input_tokens_seen": 123861805, "step": 5758, "time_per_iteration": 3.2514266967773438 }, { "auxiliary_loss_clip": 0.01122832, "auxiliary_loss_mlp": 0.01042223, "balance_loss_clip": 1.05849838, "balance_loss_mlp": 1.02783418, "epoch": 0.3462498121148354, "flos": 23472542960640.0, "grad_norm": 1.8827972101732287, "language_loss": 0.71806967, "learning_rate": 3.0394210783729487e-06, "loss": 0.73972023, "num_input_tokens_seen": 123881820, "step": 5759, "time_per_iteration": 2.943061351776123 }, { "auxiliary_loss_clip": 0.0108272, "auxiliary_loss_mlp": 0.01061154, "balance_loss_clip": 1.0455631, "balance_loss_mlp": 1.04352307, "epoch": 0.3463099353675034, "flos": 24170790418560.0, "grad_norm": 1.9206924983950955, "language_loss": 0.83097923, "learning_rate": 3.0390883250355836e-06, "loss": 0.85241801, "num_input_tokens_seen": 123903700, "step": 5760, "time_per_iteration": 2.8922929763793945 }, { "auxiliary_loss_clip": 0.01029416, "auxiliary_loss_mlp": 0.01010127, "balance_loss_clip": 1.02909803, "balance_loss_mlp": 1.00855386, "epoch": 0.34637005862017134, "flos": 63700609766400.0, "grad_norm": 0.8149802448400086, "language_loss": 0.56472003, "learning_rate": 3.0387555322958865e-06, "loss": 0.58511543, "num_input_tokens_seen": 123960075, "step": 5761, "time_per_iteration": 3.274470567703247 }, { "auxiliary_loss_clip": 0.01122229, "auxiliary_loss_mlp": 0.00773416, "balance_loss_clip": 1.04931128, "balance_loss_mlp": 1.00069964, "epoch": 0.3464301818728393, "flos": 13145532192000.0, "grad_norm": 2.486389460519204, "language_loss": 0.94996566, "learning_rate": 3.038422700166474e-06, "loss": 0.96892214, "num_input_tokens_seen": 123975805, "step": 5762, "time_per_iteration": 2.636906623840332 }, { "auxiliary_loss_clip": 0.01106692, "auxiliary_loss_mlp": 0.0104127, "balance_loss_clip": 1.04844642, "balance_loss_mlp": 1.02467608, "epoch": 0.34649030512550727, "flos": 29315173299840.0, "grad_norm": 1.8335548533403485, "language_loss": 0.69540495, "learning_rate": 3.0380898286599692e-06, "loss": 0.71688455, "num_input_tokens_seen": 123997530, "step": 5763, "time_per_iteration": 2.8476505279541016 }, { "auxiliary_loss_clip": 0.01125911, "auxiliary_loss_mlp": 0.01051478, "balance_loss_clip": 1.04963946, "balance_loss_mlp": 1.03319085, "epoch": 0.34655042837817523, "flos": 23730884553600.0, "grad_norm": 2.0043623648961195, "language_loss": 0.83985734, "learning_rate": 3.0377569177889945e-06, "loss": 0.86163127, "num_input_tokens_seen": 124016375, "step": 5764, "time_per_iteration": 2.693847417831421 }, { "auxiliary_loss_clip": 0.01103367, "auxiliary_loss_mlp": 0.01039514, "balance_loss_clip": 1.04989028, "balance_loss_mlp": 1.02363563, "epoch": 0.34661055163084326, "flos": 22054215553920.0, "grad_norm": 2.2905956292147045, "language_loss": 0.6769501, "learning_rate": 3.0374239675661722e-06, "loss": 0.69837892, "num_input_tokens_seen": 124033975, "step": 5765, "time_per_iteration": 2.7656123638153076 }, { "auxiliary_loss_clip": 0.01108658, "auxiliary_loss_mlp": 0.01045242, "balance_loss_clip": 1.05017447, "balance_loss_mlp": 1.0279808, "epoch": 0.3466706748835112, "flos": 21799213925760.0, "grad_norm": 2.7236728572511653, "language_loss": 0.77394044, "learning_rate": 3.03709097800413e-06, "loss": 0.79547942, "num_input_tokens_seen": 124051930, "step": 5766, "time_per_iteration": 2.7095906734466553 }, { "auxiliary_loss_clip": 0.01078684, "auxiliary_loss_mlp": 0.01035923, "balance_loss_clip": 1.04552221, "balance_loss_mlp": 1.02113521, "epoch": 0.3467307981361792, "flos": 19461680547840.0, "grad_norm": 1.6543575607114767, "language_loss": 0.73547316, "learning_rate": 3.0367579491154943e-06, "loss": 0.75661922, "num_input_tokens_seen": 124071220, "step": 5767, "time_per_iteration": 2.8161730766296387 }, { "auxiliary_loss_clip": 0.01111822, "auxiliary_loss_mlp": 0.01043875, "balance_loss_clip": 1.05307102, "balance_loss_mlp": 1.02734113, "epoch": 0.34679092138884715, "flos": 24827452905600.0, "grad_norm": 2.2530154082607776, "language_loss": 0.7832194, "learning_rate": 3.036424880912893e-06, "loss": 0.80477637, "num_input_tokens_seen": 124090140, "step": 5768, "time_per_iteration": 4.265673875808716 }, { "auxiliary_loss_clip": 0.01050543, "auxiliary_loss_mlp": 0.01012109, "balance_loss_clip": 1.0320363, "balance_loss_mlp": 1.0104636, "epoch": 0.3468510446415151, "flos": 63236070149760.0, "grad_norm": 0.7741250202123364, "language_loss": 0.57502627, "learning_rate": 3.036091773408956e-06, "loss": 0.59565282, "num_input_tokens_seen": 124152025, "step": 5769, "time_per_iteration": 3.2264139652252197 }, { "auxiliary_loss_clip": 0.01107195, "auxiliary_loss_mlp": 0.01044629, "balance_loss_clip": 1.04818511, "balance_loss_mlp": 1.02630615, "epoch": 0.3469111678941831, "flos": 12120713256960.0, "grad_norm": 2.34841523993127, "language_loss": 0.85575318, "learning_rate": 3.0357586266163154e-06, "loss": 0.87727135, "num_input_tokens_seen": 124165795, "step": 5770, "time_per_iteration": 2.7029645442962646 }, { "auxiliary_loss_clip": 0.01034922, "auxiliary_loss_mlp": 0.01007496, "balance_loss_clip": 1.02998519, "balance_loss_mlp": 1.00527906, "epoch": 0.34697129114685105, "flos": 65934110378880.0, "grad_norm": 0.7677707974310557, "language_loss": 0.59758615, "learning_rate": 3.0354254405476036e-06, "loss": 0.6180104, "num_input_tokens_seen": 124222925, "step": 5771, "time_per_iteration": 4.5523951053619385 }, { "auxiliary_loss_clip": 0.01127175, "auxiliary_loss_mlp": 0.01049141, "balance_loss_clip": 1.05249262, "balance_loss_mlp": 1.03320241, "epoch": 0.347031414399519, "flos": 34454205054720.0, "grad_norm": 1.9048919633537342, "language_loss": 0.71560407, "learning_rate": 3.0350922152154557e-06, "loss": 0.73736715, "num_input_tokens_seen": 124240915, "step": 5772, "time_per_iteration": 2.8108439445495605 }, { "auxiliary_loss_clip": 0.01108886, "auxiliary_loss_mlp": 0.0077423, "balance_loss_clip": 1.05118012, "balance_loss_mlp": 1.00077164, "epoch": 0.347091537652187, "flos": 26944135511040.0, "grad_norm": 1.679823492532721, "language_loss": 0.764898, "learning_rate": 3.034758950632507e-06, "loss": 0.78372908, "num_input_tokens_seen": 124262770, "step": 5773, "time_per_iteration": 2.813775062561035 }, { "auxiliary_loss_clip": 0.01128178, "auxiliary_loss_mlp": 0.01043067, "balance_loss_clip": 1.05019748, "balance_loss_mlp": 1.02674699, "epoch": 0.34715166090485494, "flos": 21142228216320.0, "grad_norm": 5.389351496516036, "language_loss": 0.70094979, "learning_rate": 3.034425646811396e-06, "loss": 0.72266221, "num_input_tokens_seen": 124280950, "step": 5774, "time_per_iteration": 4.167816162109375 }, { "auxiliary_loss_clip": 0.01113209, "auxiliary_loss_mlp": 0.00774032, "balance_loss_clip": 1.05024052, "balance_loss_mlp": 1.00071549, "epoch": 0.3472117841575229, "flos": 23478001827840.0, "grad_norm": 1.6687380405540382, "language_loss": 0.76013231, "learning_rate": 3.0340923037647602e-06, "loss": 0.77900469, "num_input_tokens_seen": 124299540, "step": 5775, "time_per_iteration": 2.739729404449463 }, { "auxiliary_loss_clip": 0.01114926, "auxiliary_loss_mlp": 0.01046919, "balance_loss_clip": 1.0480268, "balance_loss_mlp": 1.02965736, "epoch": 0.34727190741019087, "flos": 17492806408320.0, "grad_norm": 2.598065011523741, "language_loss": 0.77565503, "learning_rate": 3.0337589215052404e-06, "loss": 0.79727352, "num_input_tokens_seen": 124316285, "step": 5776, "time_per_iteration": 2.7339272499084473 }, { "auxiliary_loss_clip": 0.01036494, "auxiliary_loss_mlp": 0.01014475, "balance_loss_clip": 1.02741766, "balance_loss_mlp": 1.01280594, "epoch": 0.34733203066285884, "flos": 65265491640960.0, "grad_norm": 0.8358378555600092, "language_loss": 0.63272905, "learning_rate": 3.033425500045478e-06, "loss": 0.65323877, "num_input_tokens_seen": 124376650, "step": 5777, "time_per_iteration": 3.257993459701538 }, { "auxiliary_loss_clip": 0.01098381, "auxiliary_loss_mlp": 0.01045801, "balance_loss_clip": 1.04933393, "balance_loss_mlp": 1.02975535, "epoch": 0.3473921539155268, "flos": 28658726294400.0, "grad_norm": 3.5330364681008755, "language_loss": 0.6504612, "learning_rate": 3.033092039398119e-06, "loss": 0.67190301, "num_input_tokens_seen": 124396475, "step": 5778, "time_per_iteration": 2.775846481323242 }, { "auxiliary_loss_clip": 0.01113961, "auxiliary_loss_mlp": 0.01054607, "balance_loss_clip": 1.04786038, "balance_loss_mlp": 1.03903246, "epoch": 0.3474522771681948, "flos": 40836895355520.0, "grad_norm": 2.3967507755094064, "language_loss": 0.71278334, "learning_rate": 3.0327585395758046e-06, "loss": 0.73446906, "num_input_tokens_seen": 124416480, "step": 5779, "time_per_iteration": 2.7915873527526855 }, { "auxiliary_loss_clip": 0.01142932, "auxiliary_loss_mlp": 0.01053692, "balance_loss_clip": 1.05395269, "balance_loss_mlp": 1.03762269, "epoch": 0.3475124004208628, "flos": 24608577381120.0, "grad_norm": 2.0452202029673043, "language_loss": 0.62873107, "learning_rate": 3.0324250005911837e-06, "loss": 0.65069735, "num_input_tokens_seen": 124435950, "step": 5780, "time_per_iteration": 2.6743876934051514 }, { "auxiliary_loss_clip": 0.01095736, "auxiliary_loss_mlp": 0.01050069, "balance_loss_clip": 1.04648292, "balance_loss_mlp": 1.03446484, "epoch": 0.34757252367353075, "flos": 22711309004160.0, "grad_norm": 1.6009150193459345, "language_loss": 0.72167897, "learning_rate": 3.0320914224569033e-06, "loss": 0.743137, "num_input_tokens_seen": 124455410, "step": 5781, "time_per_iteration": 2.749302625656128 }, { "auxiliary_loss_clip": 0.01073898, "auxiliary_loss_mlp": 0.01052117, "balance_loss_clip": 1.040519, "balance_loss_mlp": 1.03405714, "epoch": 0.3476326469261987, "flos": 19828184970240.0, "grad_norm": 2.5507599846278644, "language_loss": 0.76966107, "learning_rate": 3.031757805185612e-06, "loss": 0.79092121, "num_input_tokens_seen": 124474870, "step": 5782, "time_per_iteration": 2.801867723464966 }, { "auxiliary_loss_clip": 0.01108825, "auxiliary_loss_mlp": 0.01037018, "balance_loss_clip": 1.05032897, "balance_loss_mlp": 1.02193785, "epoch": 0.3476927701788667, "flos": 19938107566080.0, "grad_norm": 2.367934041085959, "language_loss": 0.62506068, "learning_rate": 3.0314241487899622e-06, "loss": 0.64651906, "num_input_tokens_seen": 124494105, "step": 5783, "time_per_iteration": 2.709778070449829 }, { "auxiliary_loss_clip": 0.01092863, "auxiliary_loss_mlp": 0.01031024, "balance_loss_clip": 1.04997683, "balance_loss_mlp": 1.0163672, "epoch": 0.34775289343153465, "flos": 20735108490240.0, "grad_norm": 1.7498214415914104, "language_loss": 0.88513505, "learning_rate": 3.031090453282605e-06, "loss": 0.90637398, "num_input_tokens_seen": 124512030, "step": 5784, "time_per_iteration": 2.769317150115967 }, { "auxiliary_loss_clip": 0.01089006, "auxiliary_loss_mlp": 0.01036783, "balance_loss_clip": 1.05206084, "balance_loss_mlp": 1.02097547, "epoch": 0.3478130166842026, "flos": 19354846521600.0, "grad_norm": 1.703369857104052, "language_loss": 0.81740022, "learning_rate": 3.0307567186761946e-06, "loss": 0.83865809, "num_input_tokens_seen": 124530980, "step": 5785, "time_per_iteration": 2.791860818862915 }, { "auxiliary_loss_clip": 0.01106676, "auxiliary_loss_mlp": 0.01040592, "balance_loss_clip": 1.04747128, "balance_loss_mlp": 1.02563095, "epoch": 0.3478731399368706, "flos": 22051198811520.0, "grad_norm": 1.689422515624071, "language_loss": 0.80540836, "learning_rate": 3.0304229449833862e-06, "loss": 0.82688099, "num_input_tokens_seen": 124549330, "step": 5786, "time_per_iteration": 2.7547576427459717 }, { "auxiliary_loss_clip": 0.0113505, "auxiliary_loss_mlp": 0.00773369, "balance_loss_clip": 1.05242872, "balance_loss_mlp": 1.00073981, "epoch": 0.34793326318953854, "flos": 18041449720320.0, "grad_norm": 2.7072955912962686, "language_loss": 0.74945676, "learning_rate": 3.030089132216836e-06, "loss": 0.76854098, "num_input_tokens_seen": 124567200, "step": 5787, "time_per_iteration": 2.592688798904419 }, { "auxiliary_loss_clip": 0.01102822, "auxiliary_loss_mlp": 0.00773627, "balance_loss_clip": 1.04294109, "balance_loss_mlp": 1.00074553, "epoch": 0.3479933864422065, "flos": 29314670509440.0, "grad_norm": 1.9068485918966191, "language_loss": 0.81542754, "learning_rate": 3.029755280389203e-06, "loss": 0.83419204, "num_input_tokens_seen": 124587025, "step": 5788, "time_per_iteration": 2.84395694732666 }, { "auxiliary_loss_clip": 0.01144785, "auxiliary_loss_mlp": 0.01037478, "balance_loss_clip": 1.0562067, "balance_loss_mlp": 1.02140832, "epoch": 0.3480535096948745, "flos": 20120713332480.0, "grad_norm": 2.2432452775203964, "language_loss": 0.85701168, "learning_rate": 3.029421389513147e-06, "loss": 0.87883425, "num_input_tokens_seen": 124605860, "step": 5789, "time_per_iteration": 2.630535125732422 }, { "auxiliary_loss_clip": 0.01130136, "auxiliary_loss_mlp": 0.01056162, "balance_loss_clip": 1.05231345, "balance_loss_mlp": 1.04007459, "epoch": 0.34811363294754244, "flos": 18548974938240.0, "grad_norm": 5.008598067350991, "language_loss": 0.8502599, "learning_rate": 3.029087459601328e-06, "loss": 0.87212288, "num_input_tokens_seen": 124624270, "step": 5790, "time_per_iteration": 2.6052823066711426 }, { "auxiliary_loss_clip": 0.01130643, "auxiliary_loss_mlp": 0.01044731, "balance_loss_clip": 1.05373776, "balance_loss_mlp": 1.02904904, "epoch": 0.3481737562002104, "flos": 26870303105280.0, "grad_norm": 1.9264082121319324, "language_loss": 0.80832046, "learning_rate": 3.0287534906664097e-06, "loss": 0.83007419, "num_input_tokens_seen": 124644005, "step": 5791, "time_per_iteration": 2.7190260887145996 }, { "auxiliary_loss_clip": 0.01125872, "auxiliary_loss_mlp": 0.0104286, "balance_loss_clip": 1.04968619, "balance_loss_mlp": 1.02690983, "epoch": 0.3482338794528784, "flos": 28908664104960.0, "grad_norm": 2.4373031068755022, "language_loss": 0.77855796, "learning_rate": 3.028419482721056e-06, "loss": 0.80024529, "num_input_tokens_seen": 124663020, "step": 5792, "time_per_iteration": 2.7223403453826904 }, { "auxiliary_loss_clip": 0.01108923, "auxiliary_loss_mlp": 0.01034893, "balance_loss_clip": 1.04401517, "balance_loss_mlp": 1.01922882, "epoch": 0.3482940027055464, "flos": 22200767043840.0, "grad_norm": 1.6684091148270528, "language_loss": 0.81824791, "learning_rate": 3.0280854357779325e-06, "loss": 0.8396861, "num_input_tokens_seen": 124682975, "step": 5793, "time_per_iteration": 2.84191632270813 }, { "auxiliary_loss_clip": 0.01124823, "auxiliary_loss_mlp": 0.01055766, "balance_loss_clip": 1.05077863, "balance_loss_mlp": 1.0392313, "epoch": 0.34835412595821436, "flos": 20302708567680.0, "grad_norm": 1.8786694421525794, "language_loss": 0.7607373, "learning_rate": 3.027751349849706e-06, "loss": 0.78254318, "num_input_tokens_seen": 124701340, "step": 5794, "time_per_iteration": 2.707648515701294 }, { "auxiliary_loss_clip": 0.01123664, "auxiliary_loss_mlp": 0.01044013, "balance_loss_clip": 1.04820764, "balance_loss_mlp": 1.02735913, "epoch": 0.3484142492108823, "flos": 20449691020800.0, "grad_norm": 2.79979085265216, "language_loss": 0.57190084, "learning_rate": 3.0274172249490456e-06, "loss": 0.59357756, "num_input_tokens_seen": 124719165, "step": 5795, "time_per_iteration": 2.6533401012420654 }, { "auxiliary_loss_clip": 0.01106011, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.04720807, "balance_loss_mlp": 1.02177811, "epoch": 0.3484743724635503, "flos": 24352929308160.0, "grad_norm": 2.0564463844351546, "language_loss": 0.82218957, "learning_rate": 3.0270830610886213e-06, "loss": 0.84361899, "num_input_tokens_seen": 124738670, "step": 5796, "time_per_iteration": 2.6823246479034424 }, { "auxiliary_loss_clip": 0.01120404, "auxiliary_loss_mlp": 0.01034067, "balance_loss_clip": 1.04927754, "balance_loss_mlp": 1.0192616, "epoch": 0.34853449571621825, "flos": 24353001135360.0, "grad_norm": 1.9927036097023587, "language_loss": 0.83429003, "learning_rate": 3.0267488582811033e-06, "loss": 0.85583472, "num_input_tokens_seen": 124758760, "step": 5797, "time_per_iteration": 2.7048346996307373 }, { "auxiliary_loss_clip": 0.01132676, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.05049801, "balance_loss_mlp": 1.02151191, "epoch": 0.3485946189688862, "flos": 27267690245760.0, "grad_norm": 1.9361964581914621, "language_loss": 0.73449033, "learning_rate": 3.026414616539167e-06, "loss": 0.75618768, "num_input_tokens_seen": 124777765, "step": 5798, "time_per_iteration": 2.6807782649993896 }, { "auxiliary_loss_clip": 0.01135458, "auxiliary_loss_mlp": 0.01044729, "balance_loss_clip": 1.04995012, "balance_loss_mlp": 1.02815914, "epoch": 0.3486547422215542, "flos": 20156695781760.0, "grad_norm": 2.5738259800272725, "language_loss": 0.76111758, "learning_rate": 3.026080335875485e-06, "loss": 0.78291941, "num_input_tokens_seen": 124796775, "step": 5799, "time_per_iteration": 2.629671096801758 }, { "auxiliary_loss_clip": 0.01073192, "auxiliary_loss_mlp": 0.01035978, "balance_loss_clip": 1.05208993, "balance_loss_mlp": 1.02083826, "epoch": 0.34871486547422215, "flos": 20230348619520.0, "grad_norm": 2.242229362705527, "language_loss": 0.75801086, "learning_rate": 3.025746016302734e-06, "loss": 0.77910256, "num_input_tokens_seen": 124815825, "step": 5800, "time_per_iteration": 3.047725200653076 }, { "auxiliary_loss_clip": 0.01112927, "auxiliary_loss_mlp": 0.00774006, "balance_loss_clip": 1.04720354, "balance_loss_mlp": 1.00079536, "epoch": 0.3487749887268901, "flos": 44053234882560.0, "grad_norm": 2.6257316922509286, "language_loss": 0.67468953, "learning_rate": 3.025411657833591e-06, "loss": 0.69355887, "num_input_tokens_seen": 124838420, "step": 5801, "time_per_iteration": 3.2364816665649414 }, { "auxiliary_loss_clip": 0.01103773, "auxiliary_loss_mlp": 0.010448, "balance_loss_clip": 1.04506934, "balance_loss_mlp": 1.028754, "epoch": 0.3488351119795581, "flos": 23295144666240.0, "grad_norm": 1.8428676315803219, "language_loss": 0.76738638, "learning_rate": 3.025077260480735e-06, "loss": 0.78887206, "num_input_tokens_seen": 124857320, "step": 5802, "time_per_iteration": 2.7959024906158447 }, { "auxiliary_loss_clip": 0.01053855, "auxiliary_loss_mlp": 0.01037371, "balance_loss_clip": 1.03989601, "balance_loss_mlp": 1.02219605, "epoch": 0.34889523523222604, "flos": 19934839428480.0, "grad_norm": 1.7816673584343024, "language_loss": 0.78991377, "learning_rate": 3.0247428242568474e-06, "loss": 0.81082606, "num_input_tokens_seen": 124875685, "step": 5803, "time_per_iteration": 2.8440747261047363 }, { "auxiliary_loss_clip": 0.01111548, "auxiliary_loss_mlp": 0.00774436, "balance_loss_clip": 1.04601288, "balance_loss_mlp": 1.00073576, "epoch": 0.348955358484894, "flos": 30446179816320.0, "grad_norm": 6.169621760932873, "language_loss": 0.67899323, "learning_rate": 3.0244083491746085e-06, "loss": 0.69785309, "num_input_tokens_seen": 124895960, "step": 5804, "time_per_iteration": 2.8011341094970703 }, { "auxiliary_loss_clip": 0.01109039, "auxiliary_loss_mlp": 0.01046207, "balance_loss_clip": 1.05153811, "balance_loss_mlp": 1.0306263, "epoch": 0.349015481737562, "flos": 17999972490240.0, "grad_norm": 1.9366950093174176, "language_loss": 0.75972986, "learning_rate": 3.024073835246702e-06, "loss": 0.78128237, "num_input_tokens_seen": 124914140, "step": 5805, "time_per_iteration": 2.735410213470459 }, { "auxiliary_loss_clip": 0.01085261, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.040416, "balance_loss_mlp": 1.0230304, "epoch": 0.34907560499023, "flos": 27198490694400.0, "grad_norm": 2.3089286954803194, "language_loss": 0.67154014, "learning_rate": 3.023739282485814e-06, "loss": 0.69277781, "num_input_tokens_seen": 124934180, "step": 5806, "time_per_iteration": 2.793893575668335 }, { "auxiliary_loss_clip": 0.01122813, "auxiliary_loss_mlp": 0.0104012, "balance_loss_clip": 1.05324221, "balance_loss_mlp": 1.02445614, "epoch": 0.34913572824289796, "flos": 30226873328640.0, "grad_norm": 1.5212397526739, "language_loss": 0.71703929, "learning_rate": 3.023404690904629e-06, "loss": 0.73866862, "num_input_tokens_seen": 124956060, "step": 5807, "time_per_iteration": 2.7225730419158936 }, { "auxiliary_loss_clip": 0.01135343, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 1.04923332, "balance_loss_mlp": 1.02102923, "epoch": 0.3491958514955659, "flos": 29971907614080.0, "grad_norm": 2.9062872704377125, "language_loss": 0.7383548, "learning_rate": 3.0230700605158364e-06, "loss": 0.76007676, "num_input_tokens_seen": 124976070, "step": 5808, "time_per_iteration": 4.38737154006958 }, { "auxiliary_loss_clip": 0.01133483, "auxiliary_loss_mlp": 0.01047071, "balance_loss_clip": 1.05228174, "balance_loss_mlp": 1.03241384, "epoch": 0.3492559747482339, "flos": 22783273902720.0, "grad_norm": 1.513097370663534, "language_loss": 0.84501046, "learning_rate": 3.0227353913321238e-06, "loss": 0.86681598, "num_input_tokens_seen": 124996995, "step": 5809, "time_per_iteration": 2.629246711730957 }, { "auxiliary_loss_clip": 0.01106316, "auxiliary_loss_mlp": 0.01034055, "balance_loss_clip": 1.04668331, "balance_loss_mlp": 1.01995289, "epoch": 0.34931609800090185, "flos": 26068022881920.0, "grad_norm": 2.856878325415132, "language_loss": 0.80759805, "learning_rate": 3.0224006833661835e-06, "loss": 0.82900178, "num_input_tokens_seen": 125015600, "step": 5810, "time_per_iteration": 2.815232276916504 }, { "auxiliary_loss_clip": 0.01134295, "auxiliary_loss_mlp": 0.01039591, "balance_loss_clip": 1.05105019, "balance_loss_mlp": 1.02539277, "epoch": 0.3493762212535698, "flos": 29242023252480.0, "grad_norm": 1.9587859815348794, "language_loss": 0.75694251, "learning_rate": 3.0220659366307057e-06, "loss": 0.7786814, "num_input_tokens_seen": 125035290, "step": 5811, "time_per_iteration": 4.295617580413818 }, { "auxiliary_loss_clip": 0.0111498, "auxiliary_loss_mlp": 0.01040701, "balance_loss_clip": 1.04791081, "balance_loss_mlp": 1.02616942, "epoch": 0.3494363445062378, "flos": 27126058919040.0, "grad_norm": 1.5951936061604581, "language_loss": 0.80199474, "learning_rate": 3.021731151138386e-06, "loss": 0.82355154, "num_input_tokens_seen": 125057130, "step": 5812, "time_per_iteration": 2.8571486473083496 }, { "auxiliary_loss_clip": 0.0106966, "auxiliary_loss_mlp": 0.01038506, "balance_loss_clip": 1.04193187, "balance_loss_mlp": 1.02299738, "epoch": 0.34949646775890575, "flos": 12276207233280.0, "grad_norm": 1.932575417997546, "language_loss": 0.69221139, "learning_rate": 3.021396326901918e-06, "loss": 0.71329308, "num_input_tokens_seen": 125073720, "step": 5813, "time_per_iteration": 4.446147441864014 }, { "auxiliary_loss_clip": 0.01101223, "auxiliary_loss_mlp": 0.00772918, "balance_loss_clip": 1.04168797, "balance_loss_mlp": 1.00074911, "epoch": 0.3495565910115737, "flos": 17165516659200.0, "grad_norm": 2.168508070197816, "language_loss": 0.76586467, "learning_rate": 3.0210614639339998e-06, "loss": 0.7846061, "num_input_tokens_seen": 125090635, "step": 5814, "time_per_iteration": 2.698594331741333 }, { "auxiliary_loss_clip": 0.01114737, "auxiliary_loss_mlp": 0.00773337, "balance_loss_clip": 1.05010188, "balance_loss_mlp": 1.00060046, "epoch": 0.3496167142642417, "flos": 26465661417600.0, "grad_norm": 1.9777422761312171, "language_loss": 0.84760284, "learning_rate": 3.020726562247328e-06, "loss": 0.86648357, "num_input_tokens_seen": 125110070, "step": 5815, "time_per_iteration": 2.7839486598968506 }, { "auxiliary_loss_clip": 0.01117022, "auxiliary_loss_mlp": 0.01031007, "balance_loss_clip": 1.04850423, "balance_loss_mlp": 1.01695168, "epoch": 0.34967683751690964, "flos": 17414843938560.0, "grad_norm": 2.1137892099104674, "language_loss": 0.77541941, "learning_rate": 3.0203916218546024e-06, "loss": 0.79689968, "num_input_tokens_seen": 125125730, "step": 5816, "time_per_iteration": 2.6244633197784424 }, { "auxiliary_loss_clip": 0.01122041, "auxiliary_loss_mlp": 0.01042966, "balance_loss_clip": 1.05198002, "balance_loss_mlp": 1.0282141, "epoch": 0.3497369607695776, "flos": 22600021691520.0, "grad_norm": 2.2643435778821246, "language_loss": 0.5898062, "learning_rate": 3.0200566427685246e-06, "loss": 0.61145627, "num_input_tokens_seen": 125146195, "step": 5817, "time_per_iteration": 2.676058530807495 }, { "auxiliary_loss_clip": 0.01065616, "auxiliary_loss_mlp": 0.01004328, "balance_loss_clip": 1.03704262, "balance_loss_mlp": 1.00290895, "epoch": 0.34979708402224563, "flos": 68529374818560.0, "grad_norm": 0.8661744616347857, "language_loss": 0.59915632, "learning_rate": 3.0197216250017975e-06, "loss": 0.61985576, "num_input_tokens_seen": 125207790, "step": 5818, "time_per_iteration": 3.2298331260681152 }, { "auxiliary_loss_clip": 0.0109396, "auxiliary_loss_mlp": 0.01044055, "balance_loss_clip": 1.04599476, "balance_loss_mlp": 1.02892733, "epoch": 0.3498572072749136, "flos": 18989634988800.0, "grad_norm": 2.0582091611638713, "language_loss": 0.83473527, "learning_rate": 3.019386568567123e-06, "loss": 0.85611546, "num_input_tokens_seen": 125226220, "step": 5819, "time_per_iteration": 2.6558237075805664 }, { "auxiliary_loss_clip": 0.01106439, "auxiliary_loss_mlp": 0.01034351, "balance_loss_clip": 1.04502416, "balance_loss_mlp": 1.01987886, "epoch": 0.34991733052758156, "flos": 27818883423360.0, "grad_norm": 1.848700539441483, "language_loss": 0.7078613, "learning_rate": 3.0190514734772083e-06, "loss": 0.72926915, "num_input_tokens_seen": 125247485, "step": 5820, "time_per_iteration": 2.703023672103882 }, { "auxiliary_loss_clip": 0.01122902, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.04821718, "balance_loss_mlp": 1.02288496, "epoch": 0.3499774537802495, "flos": 33584197737600.0, "grad_norm": 1.691680241057735, "language_loss": 0.70418453, "learning_rate": 3.018716339744759e-06, "loss": 0.7257812, "num_input_tokens_seen": 125268625, "step": 5821, "time_per_iteration": 2.7258172035217285 }, { "auxiliary_loss_clip": 0.01128016, "auxiliary_loss_mlp": 0.01045237, "balance_loss_clip": 1.05040097, "balance_loss_mlp": 1.02945328, "epoch": 0.3500375770329175, "flos": 23476744851840.0, "grad_norm": 3.022669367007059, "language_loss": 0.73552108, "learning_rate": 3.0183811673824842e-06, "loss": 0.75725359, "num_input_tokens_seen": 125287530, "step": 5822, "time_per_iteration": 2.6288442611694336 }, { "auxiliary_loss_clip": 0.01111612, "auxiliary_loss_mlp": 0.01034787, "balance_loss_clip": 1.04867673, "balance_loss_mlp": 1.0193131, "epoch": 0.35009770028558546, "flos": 19026048401280.0, "grad_norm": 13.86145468617928, "language_loss": 0.78286207, "learning_rate": 3.018045956403094e-06, "loss": 0.80432606, "num_input_tokens_seen": 125307020, "step": 5823, "time_per_iteration": 2.585644245147705 }, { "auxiliary_loss_clip": 0.01050549, "auxiliary_loss_mlp": 0.01002993, "balance_loss_clip": 1.03169346, "balance_loss_mlp": 1.00141954, "epoch": 0.3501578235382534, "flos": 68351868783360.0, "grad_norm": 0.7268668465066358, "language_loss": 0.59232962, "learning_rate": 3.017710706819298e-06, "loss": 0.61286497, "num_input_tokens_seen": 125370445, "step": 5824, "time_per_iteration": 3.2155251502990723 }, { "auxiliary_loss_clip": 0.01110681, "auxiliary_loss_mlp": 0.01041197, "balance_loss_clip": 1.04737854, "balance_loss_mlp": 1.02561092, "epoch": 0.3502179467909214, "flos": 21250893836160.0, "grad_norm": 3.9873136748139126, "language_loss": 0.84533477, "learning_rate": 3.017375418643811e-06, "loss": 0.86685359, "num_input_tokens_seen": 125388900, "step": 5825, "time_per_iteration": 2.687849998474121 }, { "auxiliary_loss_clip": 0.01123129, "auxiliary_loss_mlp": 0.00772852, "balance_loss_clip": 1.04982102, "balance_loss_mlp": 1.00084817, "epoch": 0.35027807004358935, "flos": 11942955826560.0, "grad_norm": 3.7970216760931654, "language_loss": 0.83272213, "learning_rate": 3.0170400918893464e-06, "loss": 0.85168195, "num_input_tokens_seen": 125402675, "step": 5826, "time_per_iteration": 2.623713970184326 }, { "auxiliary_loss_clip": 0.01108751, "auxiliary_loss_mlp": 0.01045941, "balance_loss_clip": 1.04680669, "balance_loss_mlp": 1.0308249, "epoch": 0.3503381932962573, "flos": 21470918595840.0, "grad_norm": 1.799644232020304, "language_loss": 0.8068707, "learning_rate": 3.0167047265686186e-06, "loss": 0.82841766, "num_input_tokens_seen": 125421360, "step": 5827, "time_per_iteration": 2.7149739265441895 }, { "auxiliary_loss_clip": 0.01080927, "auxiliary_loss_mlp": 0.01041383, "balance_loss_clip": 1.04276204, "balance_loss_mlp": 1.02641606, "epoch": 0.3503983165489253, "flos": 21251109317760.0, "grad_norm": 3.105536532024743, "language_loss": 0.71077561, "learning_rate": 3.0163693226943467e-06, "loss": 0.73199868, "num_input_tokens_seen": 125440000, "step": 5828, "time_per_iteration": 2.7468550205230713 }, { "auxiliary_loss_clip": 0.01126682, "auxiliary_loss_mlp": 0.01050267, "balance_loss_clip": 1.05060673, "balance_loss_mlp": 1.0323143, "epoch": 0.35045843980159325, "flos": 27815723026560.0, "grad_norm": 2.750124615693701, "language_loss": 0.79695857, "learning_rate": 3.016033880279248e-06, "loss": 0.81872809, "num_input_tokens_seen": 125460390, "step": 5829, "time_per_iteration": 2.6937646865844727 }, { "auxiliary_loss_clip": 0.01096574, "auxiliary_loss_mlp": 0.01044418, "balance_loss_clip": 1.0481379, "balance_loss_mlp": 1.02766919, "epoch": 0.3505185630542612, "flos": 25921148169600.0, "grad_norm": 1.9090298023730403, "language_loss": 0.72606629, "learning_rate": 3.0156983993360417e-06, "loss": 0.74747616, "num_input_tokens_seen": 125478410, "step": 5830, "time_per_iteration": 2.7369346618652344 }, { "auxiliary_loss_clip": 0.01090166, "auxiliary_loss_mlp": 0.01037306, "balance_loss_clip": 1.04190445, "balance_loss_mlp": 1.02131414, "epoch": 0.35057868630692923, "flos": 20521763660160.0, "grad_norm": 2.5268343856675437, "language_loss": 0.88473773, "learning_rate": 3.0153628798774513e-06, "loss": 0.90601242, "num_input_tokens_seen": 125495975, "step": 5831, "time_per_iteration": 2.716801166534424 }, { "auxiliary_loss_clip": 0.01076431, "auxiliary_loss_mlp": 0.01046131, "balance_loss_clip": 1.04348278, "balance_loss_mlp": 1.03036547, "epoch": 0.3506388095595972, "flos": 20448649526400.0, "grad_norm": 2.8335622037275052, "language_loss": 0.78706706, "learning_rate": 3.0150273219161985e-06, "loss": 0.80829263, "num_input_tokens_seen": 125515035, "step": 5832, "time_per_iteration": 2.719874143600464 }, { "auxiliary_loss_clip": 0.01096023, "auxiliary_loss_mlp": 0.01049214, "balance_loss_clip": 1.04483593, "balance_loss_mlp": 1.0303669, "epoch": 0.35069893281226516, "flos": 23109665811840.0, "grad_norm": 2.771771323399588, "language_loss": 0.71084702, "learning_rate": 3.014691725465008e-06, "loss": 0.73229945, "num_input_tokens_seen": 125535555, "step": 5833, "time_per_iteration": 2.729029655456543 }, { "auxiliary_loss_clip": 0.0111933, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.04690456, "balance_loss_mlp": 1.02119827, "epoch": 0.35075905606493313, "flos": 27271999877760.0, "grad_norm": 1.4652984704802052, "language_loss": 0.80866987, "learning_rate": 3.014356090536606e-06, "loss": 0.830221, "num_input_tokens_seen": 125558195, "step": 5834, "time_per_iteration": 2.6999855041503906 }, { "auxiliary_loss_clip": 0.01086162, "auxiliary_loss_mlp": 0.01041057, "balance_loss_clip": 1.05142856, "balance_loss_mlp": 1.02516639, "epoch": 0.3508191793176011, "flos": 19128608709120.0, "grad_norm": 2.24398587431922, "language_loss": 0.84067535, "learning_rate": 3.0140204171437183e-06, "loss": 0.86194754, "num_input_tokens_seen": 125575375, "step": 5835, "time_per_iteration": 2.7401607036590576 }, { "auxiliary_loss_clip": 0.01072219, "auxiliary_loss_mlp": 0.0104369, "balance_loss_clip": 1.04324877, "balance_loss_mlp": 1.02816927, "epoch": 0.35087930257026906, "flos": 25557588662400.0, "grad_norm": 1.6286460178957367, "language_loss": 0.76643491, "learning_rate": 3.0136847052990754e-06, "loss": 0.78759408, "num_input_tokens_seen": 125596745, "step": 5836, "time_per_iteration": 2.767824649810791 }, { "auxiliary_loss_clip": 0.01095252, "auxiliary_loss_mlp": 0.01044499, "balance_loss_clip": 1.04785156, "balance_loss_mlp": 1.02751756, "epoch": 0.350939425822937, "flos": 18004246208640.0, "grad_norm": 2.0145924652365945, "language_loss": 0.77402902, "learning_rate": 3.0133489550154074e-06, "loss": 0.79542655, "num_input_tokens_seen": 125613980, "step": 5837, "time_per_iteration": 2.684300661087036 }, { "auxiliary_loss_clip": 0.01122261, "auxiliary_loss_mlp": 0.01044889, "balance_loss_clip": 1.04895687, "balance_loss_mlp": 1.02941537, "epoch": 0.350999549075605, "flos": 22273198819200.0, "grad_norm": 2.68275803808264, "language_loss": 0.67695981, "learning_rate": 3.0130131663054442e-06, "loss": 0.69863135, "num_input_tokens_seen": 125632100, "step": 5838, "time_per_iteration": 2.6679129600524902 }, { "auxiliary_loss_clip": 0.01133084, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.04808521, "balance_loss_mlp": 1.02538526, "epoch": 0.35105967232827295, "flos": 14392279307520.0, "grad_norm": 2.478699358378921, "language_loss": 0.83575064, "learning_rate": 3.0126773391819215e-06, "loss": 0.85749567, "num_input_tokens_seen": 125649190, "step": 5839, "time_per_iteration": 2.7186849117279053 }, { "auxiliary_loss_clip": 0.01125827, "auxiliary_loss_mlp": 0.01045138, "balance_loss_clip": 1.0484879, "balance_loss_mlp": 1.02930689, "epoch": 0.3511197955809409, "flos": 25082346792960.0, "grad_norm": 2.56286420283892, "language_loss": 0.58882701, "learning_rate": 3.012341473657572e-06, "loss": 0.61053669, "num_input_tokens_seen": 125668680, "step": 5840, "time_per_iteration": 2.7048165798187256 }, { "auxiliary_loss_clip": 0.01093858, "auxiliary_loss_mlp": 0.01043209, "balance_loss_clip": 1.0449121, "balance_loss_mlp": 1.02719963, "epoch": 0.3511799188336089, "flos": 25884160139520.0, "grad_norm": 2.762376787670534, "language_loss": 0.87442869, "learning_rate": 3.0120055697451322e-06, "loss": 0.89579934, "num_input_tokens_seen": 125686935, "step": 5841, "time_per_iteration": 2.763007402420044 }, { "auxiliary_loss_clip": 0.01116677, "auxiliary_loss_mlp": 0.01038697, "balance_loss_clip": 1.04990196, "balance_loss_mlp": 1.02083993, "epoch": 0.35124004208627685, "flos": 20083725302400.0, "grad_norm": 1.9868500880648916, "language_loss": 0.75116056, "learning_rate": 3.0116696274573406e-06, "loss": 0.77271438, "num_input_tokens_seen": 125707180, "step": 5842, "time_per_iteration": 2.703010082244873 }, { "auxiliary_loss_clip": 0.01124735, "auxiliary_loss_mlp": 0.01045785, "balance_loss_clip": 1.04863322, "balance_loss_mlp": 1.0302043, "epoch": 0.3513001653389448, "flos": 17783431349760.0, "grad_norm": 2.134458584945634, "language_loss": 0.68687361, "learning_rate": 3.0113336468069346e-06, "loss": 0.70857882, "num_input_tokens_seen": 125722780, "step": 5843, "time_per_iteration": 2.6459767818450928 }, { "auxiliary_loss_clip": 0.01135637, "auxiliary_loss_mlp": 0.01046534, "balance_loss_clip": 1.05054379, "balance_loss_mlp": 1.0305481, "epoch": 0.3513602885916128, "flos": 29387138198400.0, "grad_norm": 2.0610262324560984, "language_loss": 0.65392244, "learning_rate": 3.010997627806655e-06, "loss": 0.67574418, "num_input_tokens_seen": 125742110, "step": 5844, "time_per_iteration": 2.6542131900787354 }, { "auxiliary_loss_clip": 0.01119986, "auxiliary_loss_mlp": 0.01042575, "balance_loss_clip": 1.04791713, "balance_loss_mlp": 1.02620745, "epoch": 0.3514204118442808, "flos": 16179876483840.0, "grad_norm": 2.0120705985466394, "language_loss": 0.75180912, "learning_rate": 3.010661570469245e-06, "loss": 0.77343476, "num_input_tokens_seen": 125759980, "step": 5845, "time_per_iteration": 2.686753511428833 }, { "auxiliary_loss_clip": 0.01122626, "auxiliary_loss_mlp": 0.01043989, "balance_loss_clip": 1.0485301, "balance_loss_mlp": 1.02835488, "epoch": 0.35148053509694877, "flos": 23834665923840.0, "grad_norm": 4.021226487899694, "language_loss": 0.73548663, "learning_rate": 3.0103254748074465e-06, "loss": 0.7571528, "num_input_tokens_seen": 125772660, "step": 5846, "time_per_iteration": 2.67868971824646 }, { "auxiliary_loss_clip": 0.01094187, "auxiliary_loss_mlp": 0.01044379, "balance_loss_clip": 1.04565465, "balance_loss_mlp": 1.02834511, "epoch": 0.35154065834961673, "flos": 20991295267200.0, "grad_norm": 1.687499817432144, "language_loss": 0.756024, "learning_rate": 3.0099893408340046e-06, "loss": 0.77740967, "num_input_tokens_seen": 125791935, "step": 5847, "time_per_iteration": 2.749495267868042 }, { "auxiliary_loss_clip": 0.011087, "auxiliary_loss_mlp": 0.01034036, "balance_loss_clip": 1.04465413, "balance_loss_mlp": 1.01871789, "epoch": 0.3516007816022847, "flos": 33255471444480.0, "grad_norm": 2.8847551511625675, "language_loss": 0.71752924, "learning_rate": 3.009653168561666e-06, "loss": 0.73895657, "num_input_tokens_seen": 125813455, "step": 5848, "time_per_iteration": 4.367843151092529 }, { "auxiliary_loss_clip": 0.0111724, "auxiliary_loss_mlp": 0.01051356, "balance_loss_clip": 1.04754996, "balance_loss_mlp": 1.03528619, "epoch": 0.35166090485495266, "flos": 11726953390080.0, "grad_norm": 2.1303857634409455, "language_loss": 0.89211285, "learning_rate": 3.009316958003178e-06, "loss": 0.91379881, "num_input_tokens_seen": 125827660, "step": 5849, "time_per_iteration": 2.720156192779541 }, { "auxiliary_loss_clip": 0.01112345, "auxiliary_loss_mlp": 0.01035199, "balance_loss_clip": 1.04670548, "balance_loss_mlp": 1.01948714, "epoch": 0.3517210281076206, "flos": 22638446265600.0, "grad_norm": 5.671837642447228, "language_loss": 0.74645329, "learning_rate": 3.0089807091712897e-06, "loss": 0.76792872, "num_input_tokens_seen": 125846655, "step": 5850, "time_per_iteration": 5.769666910171509 }, { "auxiliary_loss_clip": 0.01124277, "auxiliary_loss_mlp": 0.01039165, "balance_loss_clip": 1.05061293, "balance_loss_mlp": 1.02304828, "epoch": 0.3517811513602886, "flos": 21322750993920.0, "grad_norm": 4.453824391316201, "language_loss": 0.75497609, "learning_rate": 3.0086444220787515e-06, "loss": 0.77661049, "num_input_tokens_seen": 125866290, "step": 5851, "time_per_iteration": 2.6903436183929443 }, { "auxiliary_loss_clip": 0.01109028, "auxiliary_loss_mlp": 0.01043585, "balance_loss_clip": 1.047647, "balance_loss_mlp": 1.02581048, "epoch": 0.35184127461295656, "flos": 21032880238080.0, "grad_norm": 2.6842208339362714, "language_loss": 0.8711859, "learning_rate": 3.0083080967383165e-06, "loss": 0.892712, "num_input_tokens_seen": 125884620, "step": 5852, "time_per_iteration": 4.37211275100708 }, { "auxiliary_loss_clip": 0.01134086, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.05088282, "balance_loss_mlp": 1.02020407, "epoch": 0.3519013978656245, "flos": 22455265881600.0, "grad_norm": 4.894656899057391, "language_loss": 0.67756367, "learning_rate": 3.007971733162737e-06, "loss": 0.69925427, "num_input_tokens_seen": 125902430, "step": 5853, "time_per_iteration": 2.6657445430755615 }, { "auxiliary_loss_clip": 0.0110992, "auxiliary_loss_mlp": 0.01035315, "balance_loss_clip": 1.04499912, "balance_loss_mlp": 1.01943672, "epoch": 0.3519615211182925, "flos": 13115295918720.0, "grad_norm": 1.9396695842158058, "language_loss": 0.80834955, "learning_rate": 3.0076353313647686e-06, "loss": 0.82980192, "num_input_tokens_seen": 125920570, "step": 5854, "time_per_iteration": 2.741804361343384 }, { "auxiliary_loss_clip": 0.0111683, "auxiliary_loss_mlp": 0.01035573, "balance_loss_clip": 1.05230534, "balance_loss_mlp": 1.02117872, "epoch": 0.35202164437096045, "flos": 19135144984320.0, "grad_norm": 2.236186864476635, "language_loss": 0.73234653, "learning_rate": 3.0072988913571666e-06, "loss": 0.75387061, "num_input_tokens_seen": 125939800, "step": 5855, "time_per_iteration": 2.730731725692749 }, { "auxiliary_loss_clip": 0.0113392, "auxiliary_loss_mlp": 0.01038425, "balance_loss_clip": 1.05024409, "balance_loss_mlp": 1.02407861, "epoch": 0.3520817676236284, "flos": 26542187343360.0, "grad_norm": 2.4482136775911427, "language_loss": 0.71000826, "learning_rate": 3.006962413152691e-06, "loss": 0.73173165, "num_input_tokens_seen": 125958720, "step": 5856, "time_per_iteration": 2.632906436920166 }, { "auxiliary_loss_clip": 0.01121339, "auxiliary_loss_mlp": 0.01047265, "balance_loss_clip": 1.0479008, "balance_loss_mlp": 1.03056359, "epoch": 0.3521418908762964, "flos": 44893472803200.0, "grad_norm": 1.9582827204032656, "language_loss": 0.61505377, "learning_rate": 3.0066258967640987e-06, "loss": 0.63673985, "num_input_tokens_seen": 125984310, "step": 5857, "time_per_iteration": 2.8992249965667725 }, { "auxiliary_loss_clip": 0.01126198, "auxiliary_loss_mlp": 0.0103782, "balance_loss_clip": 1.05141187, "balance_loss_mlp": 1.02197754, "epoch": 0.3522020141289644, "flos": 20187398931840.0, "grad_norm": 2.047463358229584, "language_loss": 0.73246485, "learning_rate": 3.006289342204152e-06, "loss": 0.75410509, "num_input_tokens_seen": 126002410, "step": 5858, "time_per_iteration": 2.6754567623138428 }, { "auxiliary_loss_clip": 0.01139705, "auxiliary_loss_mlp": 0.01044718, "balance_loss_clip": 1.05193448, "balance_loss_mlp": 1.028947, "epoch": 0.35226213738163237, "flos": 27563917708800.0, "grad_norm": 1.8174320112537778, "language_loss": 0.7662344, "learning_rate": 3.0059527494856126e-06, "loss": 0.78807867, "num_input_tokens_seen": 126022490, "step": 5859, "time_per_iteration": 2.6464414596557617 }, { "auxiliary_loss_clip": 0.01123734, "auxiliary_loss_mlp": 0.0104748, "balance_loss_clip": 1.05600715, "balance_loss_mlp": 1.03037381, "epoch": 0.35232226063430033, "flos": 22966310632320.0, "grad_norm": 2.0728265984729974, "language_loss": 0.71452159, "learning_rate": 3.0056161186212435e-06, "loss": 0.73623371, "num_input_tokens_seen": 126042895, "step": 5860, "time_per_iteration": 2.7567954063415527 }, { "auxiliary_loss_clip": 0.01107752, "auxiliary_loss_mlp": 0.01042463, "balance_loss_clip": 1.04505348, "balance_loss_mlp": 1.02517724, "epoch": 0.3523823838869683, "flos": 19168290259200.0, "grad_norm": 2.4820154826508896, "language_loss": 0.66456246, "learning_rate": 3.005279449623811e-06, "loss": 0.6860646, "num_input_tokens_seen": 126060130, "step": 5861, "time_per_iteration": 2.6954853534698486 }, { "auxiliary_loss_clip": 0.01114832, "auxiliary_loss_mlp": 0.01037396, "balance_loss_clip": 1.05085611, "balance_loss_mlp": 1.0220778, "epoch": 0.35244250713963626, "flos": 17930988420480.0, "grad_norm": 2.552495084661914, "language_loss": 0.66833258, "learning_rate": 3.0049427425060815e-06, "loss": 0.68985492, "num_input_tokens_seen": 126077850, "step": 5862, "time_per_iteration": 2.758626699447632 }, { "auxiliary_loss_clip": 0.01111543, "auxiliary_loss_mlp": 0.01046885, "balance_loss_clip": 1.04932082, "balance_loss_mlp": 1.02999306, "epoch": 0.35250263039230423, "flos": 21432529935360.0, "grad_norm": 2.001922070828984, "language_loss": 0.77027225, "learning_rate": 3.0046059972808215e-06, "loss": 0.79185653, "num_input_tokens_seen": 126095985, "step": 5863, "time_per_iteration": 2.692974328994751 }, { "auxiliary_loss_clip": 0.01124448, "auxiliary_loss_mlp": 0.01041257, "balance_loss_clip": 1.05029762, "balance_loss_mlp": 1.02602828, "epoch": 0.3525627536449722, "flos": 27416863428480.0, "grad_norm": 2.204178263750967, "language_loss": 0.75406265, "learning_rate": 3.0042692139608024e-06, "loss": 0.77571976, "num_input_tokens_seen": 126116070, "step": 5864, "time_per_iteration": 2.7303273677825928 }, { "auxiliary_loss_clip": 0.01124417, "auxiliary_loss_mlp": 0.01048097, "balance_loss_clip": 1.04847336, "balance_loss_mlp": 1.03237331, "epoch": 0.35262287689764016, "flos": 24789818430720.0, "grad_norm": 2.3571129928423713, "language_loss": 0.79312253, "learning_rate": 3.003932392558793e-06, "loss": 0.81484771, "num_input_tokens_seen": 126135205, "step": 5865, "time_per_iteration": 2.6439075469970703 }, { "auxiliary_loss_clip": 0.01136688, "auxiliary_loss_mlp": 0.01047929, "balance_loss_clip": 1.05626893, "balance_loss_mlp": 1.03143001, "epoch": 0.3526830001503081, "flos": 17821604528640.0, "grad_norm": 2.261768767041389, "language_loss": 0.81215894, "learning_rate": 3.0035955330875677e-06, "loss": 0.83400512, "num_input_tokens_seen": 126151895, "step": 5866, "time_per_iteration": 2.649991035461426 }, { "auxiliary_loss_clip": 0.01095064, "auxiliary_loss_mlp": 0.01040513, "balance_loss_clip": 1.04940605, "balance_loss_mlp": 1.0227983, "epoch": 0.3527431234029761, "flos": 18078114528000.0, "grad_norm": 2.4092573216113182, "language_loss": 0.84224141, "learning_rate": 3.0032586355598986e-06, "loss": 0.86359721, "num_input_tokens_seen": 126168515, "step": 5867, "time_per_iteration": 2.7634172439575195 }, { "auxiliary_loss_clip": 0.01142449, "auxiliary_loss_mlp": 0.01051484, "balance_loss_clip": 1.05421114, "balance_loss_mlp": 1.03525996, "epoch": 0.35280324665564405, "flos": 19427350124160.0, "grad_norm": 1.8115003163784764, "language_loss": 0.74367464, "learning_rate": 3.0029216999885613e-06, "loss": 0.76561391, "num_input_tokens_seen": 126186460, "step": 5868, "time_per_iteration": 2.5986721515655518 }, { "auxiliary_loss_clip": 0.01131163, "auxiliary_loss_mlp": 0.01040977, "balance_loss_clip": 1.05391645, "balance_loss_mlp": 1.02457356, "epoch": 0.352863369908312, "flos": 21504027957120.0, "grad_norm": 1.9536193185751474, "language_loss": 0.6105355, "learning_rate": 3.0025847263863327e-06, "loss": 0.63225693, "num_input_tokens_seen": 126206170, "step": 5869, "time_per_iteration": 2.6737887859344482 }, { "auxiliary_loss_clip": 0.0112854, "auxiliary_loss_mlp": 0.01048512, "balance_loss_clip": 1.05128717, "balance_loss_mlp": 1.03254998, "epoch": 0.35292349316098, "flos": 22309504490880.0, "grad_norm": 2.4234624332717347, "language_loss": 0.74279565, "learning_rate": 3.0022477147659917e-06, "loss": 0.76456618, "num_input_tokens_seen": 126225605, "step": 5870, "time_per_iteration": 2.6921114921569824 }, { "auxiliary_loss_clip": 0.01126478, "auxiliary_loss_mlp": 0.01039703, "balance_loss_clip": 1.05037582, "balance_loss_mlp": 1.02376485, "epoch": 0.352983616413648, "flos": 33109745967360.0, "grad_norm": 1.6641276231491144, "language_loss": 0.71796882, "learning_rate": 3.001910665140316e-06, "loss": 0.73963058, "num_input_tokens_seen": 126250230, "step": 5871, "time_per_iteration": 2.8457682132720947 }, { "auxiliary_loss_clip": 0.01120204, "auxiliary_loss_mlp": 0.01040363, "balance_loss_clip": 1.04829907, "balance_loss_mlp": 1.02547359, "epoch": 0.35304373966631597, "flos": 18696603836160.0, "grad_norm": 2.0001362497177233, "language_loss": 0.73279023, "learning_rate": 3.0015735775220873e-06, "loss": 0.75439584, "num_input_tokens_seen": 126268315, "step": 5872, "time_per_iteration": 2.6763055324554443 }, { "auxiliary_loss_clip": 0.01114426, "auxiliary_loss_mlp": 0.0077352, "balance_loss_clip": 1.04808497, "balance_loss_mlp": 1.00056779, "epoch": 0.35310386291898394, "flos": 23364954748800.0, "grad_norm": 1.9067005964756008, "language_loss": 0.82472706, "learning_rate": 3.001236451924089e-06, "loss": 0.84360659, "num_input_tokens_seen": 126288390, "step": 5873, "time_per_iteration": 2.7487120628356934 }, { "auxiliary_loss_clip": 0.0111852, "auxiliary_loss_mlp": 0.01055173, "balance_loss_clip": 1.04805684, "balance_loss_mlp": 1.03743458, "epoch": 0.3531639861716519, "flos": 24461954064000.0, "grad_norm": 2.0747562837168956, "language_loss": 0.65867126, "learning_rate": 3.000899288359104e-06, "loss": 0.68040824, "num_input_tokens_seen": 126305750, "step": 5874, "time_per_iteration": 2.717100143432617 }, { "auxiliary_loss_clip": 0.01065517, "auxiliary_loss_mlp": 0.01018804, "balance_loss_clip": 1.04397154, "balance_loss_mlp": 1.01712346, "epoch": 0.35322410942431987, "flos": 70312446881280.0, "grad_norm": 0.7718710282270123, "language_loss": 0.61513722, "learning_rate": 3.000562086839917e-06, "loss": 0.63598049, "num_input_tokens_seen": 126362495, "step": 5875, "time_per_iteration": 3.1768009662628174 }, { "auxiliary_loss_clip": 0.0106968, "auxiliary_loss_mlp": 0.01053019, "balance_loss_clip": 1.04069328, "balance_loss_mlp": 1.03722405, "epoch": 0.35328423267698783, "flos": 19820894509440.0, "grad_norm": 1.9274751499515825, "language_loss": 0.79748046, "learning_rate": 3.0002248473793163e-06, "loss": 0.81870747, "num_input_tokens_seen": 126378320, "step": 5876, "time_per_iteration": 2.7911314964294434 }, { "auxiliary_loss_clip": 0.01038976, "auxiliary_loss_mlp": 0.00753375, "balance_loss_clip": 1.03853297, "balance_loss_mlp": 1.00146759, "epoch": 0.3533443559296558, "flos": 60826356391680.0, "grad_norm": 0.6715924709851474, "language_loss": 0.56771934, "learning_rate": 2.999887569990088e-06, "loss": 0.58564281, "num_input_tokens_seen": 126442735, "step": 5877, "time_per_iteration": 3.3190126419067383 }, { "auxiliary_loss_clip": 0.01106988, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.04755747, "balance_loss_mlp": 1.02150357, "epoch": 0.35340447918232376, "flos": 24755775315840.0, "grad_norm": 2.262624772342981, "language_loss": 0.72041059, "learning_rate": 2.999550254685024e-06, "loss": 0.74185729, "num_input_tokens_seen": 126463090, "step": 5878, "time_per_iteration": 2.769482135772705 }, { "auxiliary_loss_clip": 0.01111223, "auxiliary_loss_mlp": 0.01039233, "balance_loss_clip": 1.0494144, "balance_loss_mlp": 1.02333045, "epoch": 0.3534646024349917, "flos": 21796304924160.0, "grad_norm": 1.9529875004972157, "language_loss": 0.78282005, "learning_rate": 2.9992129014769136e-06, "loss": 0.80432463, "num_input_tokens_seen": 126482105, "step": 5879, "time_per_iteration": 2.7066614627838135 }, { "auxiliary_loss_clip": 0.01111375, "auxiliary_loss_mlp": 0.01046843, "balance_loss_clip": 1.05344558, "balance_loss_mlp": 1.0287354, "epoch": 0.3535247256876597, "flos": 20012119539840.0, "grad_norm": 2.4774809869114547, "language_loss": 0.63312674, "learning_rate": 2.9988755103785493e-06, "loss": 0.65470898, "num_input_tokens_seen": 126502125, "step": 5880, "time_per_iteration": 2.87187123298645 }, { "auxiliary_loss_clip": 0.01116729, "auxiliary_loss_mlp": 0.01037267, "balance_loss_clip": 1.05014002, "balance_loss_mlp": 1.02067327, "epoch": 0.35358484894032766, "flos": 18187929383040.0, "grad_norm": 2.079670586085082, "language_loss": 0.65503716, "learning_rate": 2.998538081402727e-06, "loss": 0.67657715, "num_input_tokens_seen": 126521950, "step": 5881, "time_per_iteration": 2.701570510864258 }, { "auxiliary_loss_clip": 0.01119778, "auxiliary_loss_mlp": 0.01035576, "balance_loss_clip": 1.05182576, "balance_loss_mlp": 1.02047253, "epoch": 0.3536449721929956, "flos": 22820369673600.0, "grad_norm": 1.437925300063569, "language_loss": 0.75797737, "learning_rate": 2.998200614562239e-06, "loss": 0.77953088, "num_input_tokens_seen": 126542445, "step": 5882, "time_per_iteration": 2.713350772857666 }, { "auxiliary_loss_clip": 0.01112568, "auxiliary_loss_mlp": 0.01044857, "balance_loss_clip": 1.0485872, "balance_loss_mlp": 1.02591491, "epoch": 0.3537050954456636, "flos": 26432336574720.0, "grad_norm": 2.160470372067537, "language_loss": 0.70095098, "learning_rate": 2.9978631098698847e-06, "loss": 0.72252524, "num_input_tokens_seen": 126560690, "step": 5883, "time_per_iteration": 2.77695631980896 }, { "auxiliary_loss_clip": 0.01107169, "auxiliary_loss_mlp": 0.01040706, "balance_loss_clip": 1.04937398, "balance_loss_mlp": 1.02364671, "epoch": 0.3537652186983316, "flos": 17197153562880.0, "grad_norm": 3.3935912100169117, "language_loss": 0.78052664, "learning_rate": 2.9975255673384614e-06, "loss": 0.80200535, "num_input_tokens_seen": 126577620, "step": 5884, "time_per_iteration": 2.8704800605773926 }, { "auxiliary_loss_clip": 0.0111409, "auxiliary_loss_mlp": 0.01036742, "balance_loss_clip": 1.05093837, "balance_loss_mlp": 1.02157819, "epoch": 0.3538253419509996, "flos": 19536769929600.0, "grad_norm": 1.9052381201351025, "language_loss": 0.7519542, "learning_rate": 2.9971879869807673e-06, "loss": 0.77346253, "num_input_tokens_seen": 126596235, "step": 5885, "time_per_iteration": 2.74930477142334 }, { "auxiliary_loss_clip": 0.01088229, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.04355764, "balance_loss_mlp": 1.02321255, "epoch": 0.35388546520366754, "flos": 12128578335360.0, "grad_norm": 3.360136520151105, "language_loss": 0.83904099, "learning_rate": 2.996850368809606e-06, "loss": 0.86033243, "num_input_tokens_seen": 126612830, "step": 5886, "time_per_iteration": 2.9362361431121826 }, { "auxiliary_loss_clip": 0.01139122, "auxiliary_loss_mlp": 0.01039479, "balance_loss_clip": 1.05223978, "balance_loss_mlp": 1.02178788, "epoch": 0.3539455884563355, "flos": 19678149861120.0, "grad_norm": 2.3342407880968765, "language_loss": 0.78239143, "learning_rate": 2.9965127128377787e-06, "loss": 0.8041774, "num_input_tokens_seen": 126630910, "step": 5887, "time_per_iteration": 4.157519340515137 }, { "auxiliary_loss_clip": 0.01079386, "auxiliary_loss_mlp": 0.01047635, "balance_loss_clip": 1.04380405, "balance_loss_mlp": 1.03155398, "epoch": 0.35400571170900347, "flos": 18072045129600.0, "grad_norm": 3.4693260211189614, "language_loss": 0.65532601, "learning_rate": 2.996175019078089e-06, "loss": 0.67659628, "num_input_tokens_seen": 126648365, "step": 5888, "time_per_iteration": 2.7693519592285156 }, { "auxiliary_loss_clip": 0.01108859, "auxiliary_loss_mlp": 0.01038745, "balance_loss_clip": 1.04853678, "balance_loss_mlp": 1.02278328, "epoch": 0.35406583496167143, "flos": 26068058795520.0, "grad_norm": 2.324375134725136, "language_loss": 0.77100271, "learning_rate": 2.9958372875433437e-06, "loss": 0.7924788, "num_input_tokens_seen": 126667500, "step": 5889, "time_per_iteration": 4.211338996887207 }, { "auxiliary_loss_clip": 0.0110217, "auxiliary_loss_mlp": 0.01041504, "balance_loss_clip": 1.05017257, "balance_loss_mlp": 1.0262332, "epoch": 0.3541259582143394, "flos": 19792453916160.0, "grad_norm": 2.074151752869495, "language_loss": 0.81132901, "learning_rate": 2.9954995182463478e-06, "loss": 0.83276576, "num_input_tokens_seen": 126686820, "step": 5890, "time_per_iteration": 4.248823642730713 }, { "auxiliary_loss_clip": 0.01112591, "auxiliary_loss_mlp": 0.01034659, "balance_loss_clip": 1.04692972, "balance_loss_mlp": 1.01979923, "epoch": 0.35418608146700736, "flos": 24022084112640.0, "grad_norm": 1.8036187380252735, "language_loss": 0.79384875, "learning_rate": 2.99516171119991e-06, "loss": 0.81532121, "num_input_tokens_seen": 126706965, "step": 5891, "time_per_iteration": 4.335815668106079 }, { "auxiliary_loss_clip": 0.01099264, "auxiliary_loss_mlp": 0.01046084, "balance_loss_clip": 1.04669261, "balance_loss_mlp": 1.0285244, "epoch": 0.35424620471967533, "flos": 12385770693120.0, "grad_norm": 2.015603194975926, "language_loss": 0.73404211, "learning_rate": 2.9948238664168415e-06, "loss": 0.75549555, "num_input_tokens_seen": 126724015, "step": 5892, "time_per_iteration": 2.760498046875 }, { "auxiliary_loss_clip": 0.01112321, "auxiliary_loss_mlp": 0.01041472, "balance_loss_clip": 1.04650092, "balance_loss_mlp": 1.02434158, "epoch": 0.3543063279723433, "flos": 19673624747520.0, "grad_norm": 2.094655212929219, "language_loss": 0.6720162, "learning_rate": 2.9944859839099518e-06, "loss": 0.6935541, "num_input_tokens_seen": 126737565, "step": 5893, "time_per_iteration": 2.671706199645996 }, { "auxiliary_loss_clip": 0.01084647, "auxiliary_loss_mlp": 0.01041527, "balance_loss_clip": 1.04317796, "balance_loss_mlp": 1.02440834, "epoch": 0.35436645122501126, "flos": 21909208348800.0, "grad_norm": 1.9115541405313234, "language_loss": 0.69860309, "learning_rate": 2.9941480636920533e-06, "loss": 0.71986485, "num_input_tokens_seen": 126756095, "step": 5894, "time_per_iteration": 2.720066785812378 }, { "auxiliary_loss_clip": 0.01111006, "auxiliary_loss_mlp": 0.00773076, "balance_loss_clip": 1.04764175, "balance_loss_mlp": 1.00055242, "epoch": 0.3544265744776792, "flos": 21719527603200.0, "grad_norm": 1.7998653616668008, "language_loss": 0.74833035, "learning_rate": 2.9938101057759615e-06, "loss": 0.76717114, "num_input_tokens_seen": 126775455, "step": 5895, "time_per_iteration": 2.8295304775238037 }, { "auxiliary_loss_clip": 0.011052, "auxiliary_loss_mlp": 0.01040742, "balance_loss_clip": 1.04288006, "balance_loss_mlp": 1.02485108, "epoch": 0.3544866977303472, "flos": 21213223447680.0, "grad_norm": 2.053997857318945, "language_loss": 0.83762395, "learning_rate": 2.993472110174491e-06, "loss": 0.85908329, "num_input_tokens_seen": 126792320, "step": 5896, "time_per_iteration": 2.723158836364746 }, { "auxiliary_loss_clip": 0.01111237, "auxiliary_loss_mlp": 0.00773671, "balance_loss_clip": 1.04756641, "balance_loss_mlp": 1.0005331, "epoch": 0.35454682098301515, "flos": 29311402371840.0, "grad_norm": 1.7709518935889355, "language_loss": 0.70033729, "learning_rate": 2.9931340769004576e-06, "loss": 0.71918637, "num_input_tokens_seen": 126813680, "step": 5897, "time_per_iteration": 2.744617223739624 }, { "auxiliary_loss_clip": 0.01111293, "auxiliary_loss_mlp": 0.01046033, "balance_loss_clip": 1.04829669, "balance_loss_mlp": 1.02830625, "epoch": 0.3546069442356832, "flos": 24316587722880.0, "grad_norm": 3.0934933528513344, "language_loss": 0.81546402, "learning_rate": 2.9927960059666816e-06, "loss": 0.83703721, "num_input_tokens_seen": 126834395, "step": 5898, "time_per_iteration": 2.77911376953125 }, { "auxiliary_loss_clip": 0.0113395, "auxiliary_loss_mlp": 0.01037456, "balance_loss_clip": 1.04943967, "balance_loss_mlp": 1.02232838, "epoch": 0.35466706748835114, "flos": 22857285876480.0, "grad_norm": 5.100417261000322, "language_loss": 0.73975331, "learning_rate": 2.9924578973859804e-06, "loss": 0.7614674, "num_input_tokens_seen": 126855145, "step": 5899, "time_per_iteration": 2.6566851139068604 }, { "auxiliary_loss_clip": 0.0113747, "auxiliary_loss_mlp": 0.00772565, "balance_loss_clip": 1.04971743, "balance_loss_mlp": 1.00056052, "epoch": 0.3547271907410191, "flos": 28330107742080.0, "grad_norm": 1.7615083390778834, "language_loss": 0.79458243, "learning_rate": 2.9921197511711763e-06, "loss": 0.81368273, "num_input_tokens_seen": 126873790, "step": 5900, "time_per_iteration": 2.6658642292022705 }, { "auxiliary_loss_clip": 0.0111331, "auxiliary_loss_mlp": 0.01044824, "balance_loss_clip": 1.04659319, "balance_loss_mlp": 1.0288384, "epoch": 0.35478731399368707, "flos": 23514092017920.0, "grad_norm": 2.160550694830747, "language_loss": 0.81303531, "learning_rate": 2.991781567335093e-06, "loss": 0.83461666, "num_input_tokens_seen": 126892865, "step": 5901, "time_per_iteration": 2.711568593978882 }, { "auxiliary_loss_clip": 0.01125037, "auxiliary_loss_mlp": 0.00772744, "balance_loss_clip": 1.05092883, "balance_loss_mlp": 1.00049663, "epoch": 0.35484743724635504, "flos": 18624315715200.0, "grad_norm": 2.0558354102165373, "language_loss": 0.75869077, "learning_rate": 2.9914433458905525e-06, "loss": 0.7776686, "num_input_tokens_seen": 126911935, "step": 5902, "time_per_iteration": 2.6833012104034424 }, { "auxiliary_loss_clip": 0.01123978, "auxiliary_loss_mlp": 0.01036322, "balance_loss_clip": 1.04852581, "balance_loss_mlp": 1.02142096, "epoch": 0.354907560499023, "flos": 17384499924480.0, "grad_norm": 2.534328384273088, "language_loss": 0.70550704, "learning_rate": 2.991105086850381e-06, "loss": 0.72711003, "num_input_tokens_seen": 126930040, "step": 5903, "time_per_iteration": 2.689303159713745 }, { "auxiliary_loss_clip": 0.01128401, "auxiliary_loss_mlp": 0.01036477, "balance_loss_clip": 1.05025887, "balance_loss_mlp": 1.02051437, "epoch": 0.35496768375169097, "flos": 19208546426880.0, "grad_norm": 3.3775979872187203, "language_loss": 0.7448622, "learning_rate": 2.9907667902274053e-06, "loss": 0.76651096, "num_input_tokens_seen": 126948390, "step": 5904, "time_per_iteration": 2.6360747814178467 }, { "auxiliary_loss_clip": 0.01113034, "auxiliary_loss_mlp": 0.00772738, "balance_loss_clip": 1.04721618, "balance_loss_mlp": 1.000543, "epoch": 0.35502780700435893, "flos": 18332792933760.0, "grad_norm": 3.051840518778985, "language_loss": 0.78653091, "learning_rate": 2.9904284560344536e-06, "loss": 0.80538863, "num_input_tokens_seen": 126964905, "step": 5905, "time_per_iteration": 2.8539419174194336 }, { "auxiliary_loss_clip": 0.01101916, "auxiliary_loss_mlp": 0.01038927, "balance_loss_clip": 1.04842138, "balance_loss_mlp": 1.02486014, "epoch": 0.3550879302570269, "flos": 15448555578240.0, "grad_norm": 18.846860460510154, "language_loss": 0.72740704, "learning_rate": 2.990090084284356e-06, "loss": 0.74881542, "num_input_tokens_seen": 126982000, "step": 5906, "time_per_iteration": 2.7013392448425293 }, { "auxiliary_loss_clip": 0.01109726, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.04908431, "balance_loss_mlp": 1.02265012, "epoch": 0.35514805350969486, "flos": 21979197999360.0, "grad_norm": 1.821131131528883, "language_loss": 0.74746358, "learning_rate": 2.9897516749899426e-06, "loss": 0.76895893, "num_input_tokens_seen": 126998390, "step": 5907, "time_per_iteration": 2.7603847980499268 }, { "auxiliary_loss_clip": 0.01062812, "auxiliary_loss_mlp": 0.01042872, "balance_loss_clip": 1.03682017, "balance_loss_mlp": 1.02463293, "epoch": 0.3552081767623628, "flos": 29861949104640.0, "grad_norm": 3.0473905008627775, "language_loss": 0.7563526, "learning_rate": 2.989413228164047e-06, "loss": 0.77740943, "num_input_tokens_seen": 127020220, "step": 5908, "time_per_iteration": 2.8653454780578613 }, { "auxiliary_loss_clip": 0.01114185, "auxiliary_loss_mlp": 0.01042445, "balance_loss_clip": 1.05034626, "balance_loss_mlp": 1.02736473, "epoch": 0.3552683000150308, "flos": 26432264747520.0, "grad_norm": 2.926995842336842, "language_loss": 0.68243527, "learning_rate": 2.989074743819502e-06, "loss": 0.70400161, "num_input_tokens_seen": 127038585, "step": 5909, "time_per_iteration": 2.6967928409576416 }, { "auxiliary_loss_clip": 0.01120713, "auxiliary_loss_mlp": 0.01037454, "balance_loss_clip": 1.0503571, "balance_loss_mlp": 1.02271986, "epoch": 0.35532842326769876, "flos": 19785989468160.0, "grad_norm": 2.2169711344959864, "language_loss": 0.78605235, "learning_rate": 2.988736221969144e-06, "loss": 0.807634, "num_input_tokens_seen": 127056215, "step": 5910, "time_per_iteration": 2.65592885017395 }, { "auxiliary_loss_clip": 0.01111825, "auxiliary_loss_mlp": 0.01044022, "balance_loss_clip": 1.04383612, "balance_loss_mlp": 1.02745175, "epoch": 0.3553885465203668, "flos": 17239277237760.0, "grad_norm": 4.097628076705993, "language_loss": 0.71322721, "learning_rate": 2.98839766262581e-06, "loss": 0.73478568, "num_input_tokens_seen": 127075825, "step": 5911, "time_per_iteration": 2.6958134174346924 }, { "auxiliary_loss_clip": 0.01122761, "auxiliary_loss_mlp": 0.01041881, "balance_loss_clip": 1.04820287, "balance_loss_mlp": 1.02711153, "epoch": 0.35544866977303474, "flos": 14934350430720.0, "grad_norm": 2.592685980990988, "language_loss": 0.86703777, "learning_rate": 2.9880590658023366e-06, "loss": 0.88868415, "num_input_tokens_seen": 127091205, "step": 5912, "time_per_iteration": 2.615788221359253 }, { "auxiliary_loss_clip": 0.01113661, "auxiliary_loss_mlp": 0.01038659, "balance_loss_clip": 1.04849911, "balance_loss_mlp": 1.02413917, "epoch": 0.3555087930257027, "flos": 19756040503680.0, "grad_norm": 1.9602305341473392, "language_loss": 0.76948488, "learning_rate": 2.9877204315115646e-06, "loss": 0.79100811, "num_input_tokens_seen": 127109210, "step": 5913, "time_per_iteration": 2.7827799320220947 }, { "auxiliary_loss_clip": 0.01098195, "auxiliary_loss_mlp": 0.01036489, "balance_loss_clip": 1.04796672, "balance_loss_mlp": 1.02183783, "epoch": 0.3555689162783707, "flos": 21068252156160.0, "grad_norm": 1.6272917241322848, "language_loss": 0.82545209, "learning_rate": 2.9873817597663353e-06, "loss": 0.8467989, "num_input_tokens_seen": 127128400, "step": 5914, "time_per_iteration": 2.7242603302001953 }, { "auxiliary_loss_clip": 0.01137835, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.05178475, "balance_loss_mlp": 1.02247739, "epoch": 0.35562903953103864, "flos": 33069633454080.0, "grad_norm": 2.9034799926536, "language_loss": 0.70664769, "learning_rate": 2.98704305057949e-06, "loss": 0.72840279, "num_input_tokens_seen": 127149965, "step": 5915, "time_per_iteration": 2.6785290241241455 }, { "auxiliary_loss_clip": 0.01124956, "auxiliary_loss_mlp": 0.01042738, "balance_loss_clip": 1.04884696, "balance_loss_mlp": 1.02823067, "epoch": 0.3556891627837066, "flos": 20557853850240.0, "grad_norm": 1.7433450554379117, "language_loss": 0.76387751, "learning_rate": 2.9867043039638737e-06, "loss": 0.78555447, "num_input_tokens_seen": 127169865, "step": 5916, "time_per_iteration": 2.646141529083252 }, { "auxiliary_loss_clip": 0.01103991, "auxiliary_loss_mlp": 0.01039438, "balance_loss_clip": 1.04549897, "balance_loss_mlp": 1.02451277, "epoch": 0.35574928603637457, "flos": 20703327932160.0, "grad_norm": 1.7213233773991115, "language_loss": 0.88551259, "learning_rate": 2.986365519932332e-06, "loss": 0.9069469, "num_input_tokens_seen": 127188075, "step": 5917, "time_per_iteration": 2.735424757003784 }, { "auxiliary_loss_clip": 0.01057648, "auxiliary_loss_mlp": 0.01050179, "balance_loss_clip": 1.03888357, "balance_loss_mlp": 1.03190458, "epoch": 0.35580940928904253, "flos": 15194595444480.0, "grad_norm": 2.1986231946039916, "language_loss": 0.74800515, "learning_rate": 2.98602669849771e-06, "loss": 0.76908338, "num_input_tokens_seen": 127206065, "step": 5918, "time_per_iteration": 2.759612798690796 }, { "auxiliary_loss_clip": 0.01046226, "auxiliary_loss_mlp": 0.01004318, "balance_loss_clip": 1.03416467, "balance_loss_mlp": 1.00212467, "epoch": 0.3558695325417105, "flos": 58639145431680.0, "grad_norm": 0.9523078238877629, "language_loss": 0.63871694, "learning_rate": 2.985687839672857e-06, "loss": 0.65922242, "num_input_tokens_seen": 127257885, "step": 5919, "time_per_iteration": 2.974400281906128 }, { "auxiliary_loss_clip": 0.01125949, "auxiliary_loss_mlp": 0.01037737, "balance_loss_clip": 1.05126309, "balance_loss_mlp": 1.02168, "epoch": 0.35592965579437846, "flos": 22018233104640.0, "grad_norm": 2.3466450300124952, "language_loss": 0.73515332, "learning_rate": 2.9853489434706223e-06, "loss": 0.75679016, "num_input_tokens_seen": 127275550, "step": 5920, "time_per_iteration": 2.6402368545532227 }, { "auxiliary_loss_clip": 0.01092607, "auxiliary_loss_mlp": 0.01035798, "balance_loss_clip": 1.0452888, "balance_loss_mlp": 1.02082539, "epoch": 0.35598977904704643, "flos": 23367684182400.0, "grad_norm": 2.020155019062759, "language_loss": 0.76745147, "learning_rate": 2.985010009903857e-06, "loss": 0.78873557, "num_input_tokens_seen": 127295110, "step": 5921, "time_per_iteration": 2.7224855422973633 }, { "auxiliary_loss_clip": 0.01112186, "auxiliary_loss_mlp": 0.01038012, "balance_loss_clip": 1.04887438, "balance_loss_mlp": 1.0231111, "epoch": 0.3560499022997144, "flos": 17785334770560.0, "grad_norm": 2.0978128065546717, "language_loss": 0.68095905, "learning_rate": 2.9846710389854133e-06, "loss": 0.702461, "num_input_tokens_seen": 127312865, "step": 5922, "time_per_iteration": 2.6849706172943115 }, { "auxiliary_loss_clip": 0.01120912, "auxiliary_loss_mlp": 0.01035687, "balance_loss_clip": 1.04752564, "balance_loss_mlp": 1.02032125, "epoch": 0.35611002555238236, "flos": 20740459616640.0, "grad_norm": 3.470851899346702, "language_loss": 0.79121947, "learning_rate": 2.9843320307281454e-06, "loss": 0.81278539, "num_input_tokens_seen": 127331710, "step": 5923, "time_per_iteration": 2.659977436065674 }, { "auxiliary_loss_clip": 0.01118161, "auxiliary_loss_mlp": 0.01042419, "balance_loss_clip": 1.0530231, "balance_loss_mlp": 1.02770221, "epoch": 0.3561701488050504, "flos": 19462219251840.0, "grad_norm": 2.2084385051152946, "language_loss": 0.85266459, "learning_rate": 2.983992985144908e-06, "loss": 0.87427044, "num_input_tokens_seen": 127350950, "step": 5924, "time_per_iteration": 2.680994987487793 }, { "auxiliary_loss_clip": 0.01109604, "auxiliary_loss_mlp": 0.01046078, "balance_loss_clip": 1.04669881, "balance_loss_mlp": 1.02974653, "epoch": 0.35623027205771834, "flos": 30774942023040.0, "grad_norm": 3.12021389910605, "language_loss": 0.77619767, "learning_rate": 2.9836539022485578e-06, "loss": 0.79775453, "num_input_tokens_seen": 127369385, "step": 5925, "time_per_iteration": 2.854043960571289 }, { "auxiliary_loss_clip": 0.01078608, "auxiliary_loss_mlp": 0.01047631, "balance_loss_clip": 1.04546142, "balance_loss_mlp": 1.03274155, "epoch": 0.3562903953103863, "flos": 16981079299200.0, "grad_norm": 2.0406100546628108, "language_loss": 0.75402963, "learning_rate": 2.9833147820519535e-06, "loss": 0.77529198, "num_input_tokens_seen": 127386965, "step": 5926, "time_per_iteration": 4.347430467605591 }, { "auxiliary_loss_clip": 0.01110536, "auxiliary_loss_mlp": 0.00773423, "balance_loss_clip": 1.04907203, "balance_loss_mlp": 1.00041842, "epoch": 0.3563505185630543, "flos": 23839837482240.0, "grad_norm": 2.7011184644215254, "language_loss": 0.69563019, "learning_rate": 2.9829756245679544e-06, "loss": 0.71446979, "num_input_tokens_seen": 127406075, "step": 5927, "time_per_iteration": 2.8237216472625732 }, { "auxiliary_loss_clip": 0.01136293, "auxiliary_loss_mlp": 0.01040585, "balance_loss_clip": 1.05083871, "balance_loss_mlp": 1.0256958, "epoch": 0.35641064181572224, "flos": 22273450214400.0, "grad_norm": 2.594343371199836, "language_loss": 0.79681075, "learning_rate": 2.9826364298094212e-06, "loss": 0.81857955, "num_input_tokens_seen": 127425350, "step": 5928, "time_per_iteration": 4.171353340148926 }, { "auxiliary_loss_clip": 0.01139765, "auxiliary_loss_mlp": 0.01040338, "balance_loss_clip": 1.05304861, "balance_loss_mlp": 1.02473354, "epoch": 0.3564707650683902, "flos": 23001251587200.0, "grad_norm": 1.4355701611092584, "language_loss": 0.81758744, "learning_rate": 2.982297197789215e-06, "loss": 0.83938849, "num_input_tokens_seen": 127446335, "step": 5929, "time_per_iteration": 4.3162572383880615 }, { "auxiliary_loss_clip": 0.01120871, "auxiliary_loss_mlp": 0.01037566, "balance_loss_clip": 1.04776335, "balance_loss_mlp": 1.02304602, "epoch": 0.35653088832105817, "flos": 14684268965760.0, "grad_norm": 1.9323399136404307, "language_loss": 0.70277226, "learning_rate": 2.981957928520201e-06, "loss": 0.72435665, "num_input_tokens_seen": 127462795, "step": 5930, "time_per_iteration": 2.6527109146118164 }, { "auxiliary_loss_clip": 0.01131875, "auxiliary_loss_mlp": 0.01045641, "balance_loss_clip": 1.05533779, "balance_loss_mlp": 1.02960742, "epoch": 0.35659101157372614, "flos": 23477068074240.0, "grad_norm": 2.2535070260025147, "language_loss": 0.6758765, "learning_rate": 2.981618622015244e-06, "loss": 0.69765162, "num_input_tokens_seen": 127482675, "step": 5931, "time_per_iteration": 4.3453147411346436 }, { "auxiliary_loss_clip": 0.0112554, "auxiliary_loss_mlp": 0.01040124, "balance_loss_clip": 1.04992425, "balance_loss_mlp": 1.02531803, "epoch": 0.3566511348263941, "flos": 26578672583040.0, "grad_norm": 1.9436277425022137, "language_loss": 0.67792088, "learning_rate": 2.981279278287211e-06, "loss": 0.69957745, "num_input_tokens_seen": 127502275, "step": 5932, "time_per_iteration": 2.700096368789673 }, { "auxiliary_loss_clip": 0.01082532, "auxiliary_loss_mlp": 0.01033095, "balance_loss_clip": 1.04578543, "balance_loss_mlp": 1.01849222, "epoch": 0.35671125807906207, "flos": 13115008609920.0, "grad_norm": 5.160615382495107, "language_loss": 0.78454852, "learning_rate": 2.980939897348969e-06, "loss": 0.80570471, "num_input_tokens_seen": 127520195, "step": 5933, "time_per_iteration": 2.6900391578674316 }, { "auxiliary_loss_clip": 0.01121777, "auxiliary_loss_mlp": 0.01052933, "balance_loss_clip": 1.0480361, "balance_loss_mlp": 1.03600574, "epoch": 0.35677138133173003, "flos": 33000577557120.0, "grad_norm": 1.6861574442761758, "language_loss": 0.69256425, "learning_rate": 2.980600479213388e-06, "loss": 0.7143113, "num_input_tokens_seen": 127544495, "step": 5934, "time_per_iteration": 2.7415738105773926 }, { "auxiliary_loss_clip": 0.01117054, "auxiliary_loss_mlp": 0.0077763, "balance_loss_clip": 1.05076528, "balance_loss_mlp": 1.00057197, "epoch": 0.356831504584398, "flos": 20777842696320.0, "grad_norm": 1.9577931058258786, "language_loss": 0.70848507, "learning_rate": 2.9802610238933384e-06, "loss": 0.72743189, "num_input_tokens_seen": 127563810, "step": 5935, "time_per_iteration": 2.689974069595337 }, { "auxiliary_loss_clip": 0.01105553, "auxiliary_loss_mlp": 0.01040367, "balance_loss_clip": 1.04790044, "balance_loss_mlp": 1.02414298, "epoch": 0.35689162783706596, "flos": 12165566365440.0, "grad_norm": 2.8406009493899567, "language_loss": 0.7755211, "learning_rate": 2.979921531401692e-06, "loss": 0.79698032, "num_input_tokens_seen": 127579065, "step": 5936, "time_per_iteration": 2.741913318634033 }, { "auxiliary_loss_clip": 0.0112859, "auxiliary_loss_mlp": 0.00773213, "balance_loss_clip": 1.05281317, "balance_loss_mlp": 1.00073922, "epoch": 0.356951751089734, "flos": 23841489507840.0, "grad_norm": 1.4219917851433757, "language_loss": 0.64282179, "learning_rate": 2.9795820017513242e-06, "loss": 0.66183978, "num_input_tokens_seen": 127599105, "step": 5937, "time_per_iteration": 2.698432207107544 }, { "auxiliary_loss_clip": 0.011437, "auxiliary_loss_mlp": 0.00773044, "balance_loss_clip": 1.05475211, "balance_loss_mlp": 1.00064254, "epoch": 0.35701187434240195, "flos": 11722176881280.0, "grad_norm": 3.0634993604384744, "language_loss": 0.78483748, "learning_rate": 2.9792424349551073e-06, "loss": 0.80400497, "num_input_tokens_seen": 127614940, "step": 5938, "time_per_iteration": 2.617074489593506 }, { "auxiliary_loss_clip": 0.01104152, "auxiliary_loss_mlp": 0.01042471, "balance_loss_clip": 1.05522823, "balance_loss_mlp": 1.0276773, "epoch": 0.3570719975950699, "flos": 24898879100160.0, "grad_norm": 1.4921508018011957, "language_loss": 0.8058449, "learning_rate": 2.9789028310259202e-06, "loss": 0.82731104, "num_input_tokens_seen": 127634960, "step": 5939, "time_per_iteration": 2.805285930633545 }, { "auxiliary_loss_clip": 0.01119857, "auxiliary_loss_mlp": 0.01039048, "balance_loss_clip": 1.05386829, "balance_loss_mlp": 1.02343178, "epoch": 0.3571321208477379, "flos": 25994836920960.0, "grad_norm": 2.412769849050775, "language_loss": 0.79263425, "learning_rate": 2.9785631899766395e-06, "loss": 0.81422341, "num_input_tokens_seen": 127654545, "step": 5940, "time_per_iteration": 2.729759693145752 }, { "auxiliary_loss_clip": 0.01122797, "auxiliary_loss_mlp": 0.0103573, "balance_loss_clip": 1.05434561, "balance_loss_mlp": 1.01836729, "epoch": 0.35719224410040584, "flos": 14501663199360.0, "grad_norm": 2.99992676537861, "language_loss": 0.72561693, "learning_rate": 2.9782235118201443e-06, "loss": 0.74720228, "num_input_tokens_seen": 127672320, "step": 5941, "time_per_iteration": 2.7407357692718506 }, { "auxiliary_loss_clip": 0.01131761, "auxiliary_loss_mlp": 0.01043456, "balance_loss_clip": 1.0537883, "balance_loss_mlp": 1.02636182, "epoch": 0.3572523673530738, "flos": 31175453646720.0, "grad_norm": 4.524453853263744, "language_loss": 0.64234614, "learning_rate": 2.9778837965693154e-06, "loss": 0.66409832, "num_input_tokens_seen": 127693315, "step": 5942, "time_per_iteration": 2.693835735321045 }, { "auxiliary_loss_clip": 0.01125006, "auxiliary_loss_mlp": 0.0104058, "balance_loss_clip": 1.05074191, "balance_loss_mlp": 1.02442718, "epoch": 0.3573124906057418, "flos": 15851976203520.0, "grad_norm": 1.88999720959261, "language_loss": 0.7433207, "learning_rate": 2.9775440442370354e-06, "loss": 0.76497656, "num_input_tokens_seen": 127711570, "step": 5943, "time_per_iteration": 2.6655383110046387 }, { "auxiliary_loss_clip": 0.0107084, "auxiliary_loss_mlp": 0.01002098, "balance_loss_clip": 1.04128122, "balance_loss_mlp": 1.000512, "epoch": 0.35737261385840974, "flos": 60822729118080.0, "grad_norm": 0.7930578325967097, "language_loss": 0.60739905, "learning_rate": 2.9772042548361867e-06, "loss": 0.62812841, "num_input_tokens_seen": 127772475, "step": 5944, "time_per_iteration": 3.257052421569824 }, { "auxiliary_loss_clip": 0.01113544, "auxiliary_loss_mlp": 0.01038819, "balance_loss_clip": 1.05017304, "balance_loss_mlp": 1.02329779, "epoch": 0.3574327371110777, "flos": 18843765857280.0, "grad_norm": 2.0176419730945554, "language_loss": 0.72310007, "learning_rate": 2.976864428379655e-06, "loss": 0.74462366, "num_input_tokens_seen": 127790940, "step": 5945, "time_per_iteration": 2.6320457458496094 }, { "auxiliary_loss_clip": 0.01113199, "auxiliary_loss_mlp": 0.00773448, "balance_loss_clip": 1.04710388, "balance_loss_mlp": 1.00053716, "epoch": 0.35749286036374567, "flos": 23549679417600.0, "grad_norm": 2.1873404124300655, "language_loss": 0.81147355, "learning_rate": 2.976524564880326e-06, "loss": 0.83034003, "num_input_tokens_seen": 127808275, "step": 5946, "time_per_iteration": 2.7045581340789795 }, { "auxiliary_loss_clip": 0.01142015, "auxiliary_loss_mlp": 0.01041839, "balance_loss_clip": 1.05382085, "balance_loss_mlp": 1.02568626, "epoch": 0.35755298361641363, "flos": 21105491581440.0, "grad_norm": 1.5286248167474699, "language_loss": 0.68842459, "learning_rate": 2.9761846643510882e-06, "loss": 0.71026313, "num_input_tokens_seen": 127828840, "step": 5947, "time_per_iteration": 2.6360325813293457 }, { "auxiliary_loss_clip": 0.01107164, "auxiliary_loss_mlp": 0.01039633, "balance_loss_clip": 1.04598188, "balance_loss_mlp": 1.02426696, "epoch": 0.3576131068690816, "flos": 19245031666560.0, "grad_norm": 4.061535671212192, "language_loss": 0.76024956, "learning_rate": 2.9758447268048297e-06, "loss": 0.78171754, "num_input_tokens_seen": 127846240, "step": 5948, "time_per_iteration": 2.6968884468078613 }, { "auxiliary_loss_clip": 0.01081903, "auxiliary_loss_mlp": 0.01043894, "balance_loss_clip": 1.04692364, "balance_loss_mlp": 1.0291121, "epoch": 0.35767323012174956, "flos": 28654703971200.0, "grad_norm": 1.8353415788349725, "language_loss": 0.70553362, "learning_rate": 2.9755047522544415e-06, "loss": 0.72679162, "num_input_tokens_seen": 127866880, "step": 5949, "time_per_iteration": 2.8849079608917236 }, { "auxiliary_loss_clip": 0.01113321, "auxiliary_loss_mlp": 0.01041031, "balance_loss_clip": 1.04892492, "balance_loss_mlp": 1.02688098, "epoch": 0.35773335337441753, "flos": 17085363459840.0, "grad_norm": 2.820547719587591, "language_loss": 0.77489066, "learning_rate": 2.9751647407128154e-06, "loss": 0.79643422, "num_input_tokens_seen": 127883560, "step": 5950, "time_per_iteration": 2.6595206260681152 }, { "auxiliary_loss_clip": 0.0112732, "auxiliary_loss_mlp": 0.01041981, "balance_loss_clip": 1.04834211, "balance_loss_mlp": 1.02592397, "epoch": 0.35779347662708555, "flos": 15888605097600.0, "grad_norm": 1.7233867228761917, "language_loss": 0.72746027, "learning_rate": 2.9748246921928445e-06, "loss": 0.74915326, "num_input_tokens_seen": 127902330, "step": 5951, "time_per_iteration": 2.6544554233551025 }, { "auxiliary_loss_clip": 0.01129333, "auxiliary_loss_mlp": 0.01041471, "balance_loss_clip": 1.05047357, "balance_loss_mlp": 1.0256753, "epoch": 0.3578535998797535, "flos": 28658834035200.0, "grad_norm": 2.2344429074284693, "language_loss": 0.69326741, "learning_rate": 2.9744846067074236e-06, "loss": 0.71497542, "num_input_tokens_seen": 127922325, "step": 5952, "time_per_iteration": 2.7666146755218506 }, { "auxiliary_loss_clip": 0.01080716, "auxiliary_loss_mlp": 0.01049645, "balance_loss_clip": 1.04122877, "balance_loss_mlp": 1.03411233, "epoch": 0.3579137231324215, "flos": 37852432076160.0, "grad_norm": 4.791743787800428, "language_loss": 0.69651616, "learning_rate": 2.974144484269449e-06, "loss": 0.71781975, "num_input_tokens_seen": 127942635, "step": 5953, "time_per_iteration": 2.900196075439453 }, { "auxiliary_loss_clip": 0.01113192, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.0476222, "balance_loss_mlp": 1.0198822, "epoch": 0.35797384638508944, "flos": 22346851656960.0, "grad_norm": 2.3015234956442394, "language_loss": 0.6670965, "learning_rate": 2.9738043248918175e-06, "loss": 0.68857497, "num_input_tokens_seen": 127962520, "step": 5954, "time_per_iteration": 2.7609100341796875 }, { "auxiliary_loss_clip": 0.011102, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.04845512, "balance_loss_mlp": 1.02633798, "epoch": 0.3580339696377574, "flos": 13589711775360.0, "grad_norm": 1.9332002852280215, "language_loss": 0.74798024, "learning_rate": 2.9734641285874282e-06, "loss": 0.76948655, "num_input_tokens_seen": 127981180, "step": 5955, "time_per_iteration": 2.727787733078003 }, { "auxiliary_loss_clip": 0.01114534, "auxiliary_loss_mlp": 0.01039755, "balance_loss_clip": 1.04827058, "balance_loss_mlp": 1.02546179, "epoch": 0.3580940928904254, "flos": 23768231719680.0, "grad_norm": 1.745052650810224, "language_loss": 0.75871193, "learning_rate": 2.973123895369182e-06, "loss": 0.78025484, "num_input_tokens_seen": 127999725, "step": 5956, "time_per_iteration": 2.685006856918335 }, { "auxiliary_loss_clip": 0.01133387, "auxiliary_loss_mlp": 0.01035002, "balance_loss_clip": 1.05088747, "balance_loss_mlp": 1.0211376, "epoch": 0.35815421614309334, "flos": 19463871277440.0, "grad_norm": 4.15447674959345, "language_loss": 0.73543882, "learning_rate": 2.9727836252499805e-06, "loss": 0.75712276, "num_input_tokens_seen": 128018885, "step": 5957, "time_per_iteration": 2.6640098094940186 }, { "auxiliary_loss_clip": 0.01113163, "auxiliary_loss_mlp": 0.01037962, "balance_loss_clip": 1.04958355, "balance_loss_mlp": 1.02395511, "epoch": 0.3582143393957613, "flos": 23368186972800.0, "grad_norm": 3.3283201757671037, "language_loss": 0.70960939, "learning_rate": 2.972443318242726e-06, "loss": 0.73112065, "num_input_tokens_seen": 128037875, "step": 5958, "time_per_iteration": 2.6962838172912598 }, { "auxiliary_loss_clip": 0.01093969, "auxiliary_loss_mlp": 0.01038485, "balance_loss_clip": 1.04454029, "balance_loss_mlp": 1.02435875, "epoch": 0.35827446264842927, "flos": 26323275905280.0, "grad_norm": 2.5438119471533494, "language_loss": 0.88630176, "learning_rate": 2.972102974360324e-06, "loss": 0.90762633, "num_input_tokens_seen": 128056045, "step": 5959, "time_per_iteration": 2.713508129119873 }, { "auxiliary_loss_clip": 0.0113447, "auxiliary_loss_mlp": 0.010399, "balance_loss_clip": 1.05009389, "balance_loss_mlp": 1.02511787, "epoch": 0.35833458590109724, "flos": 30446610779520.0, "grad_norm": 2.2010810744211486, "language_loss": 0.58033586, "learning_rate": 2.971762593615679e-06, "loss": 0.60207957, "num_input_tokens_seen": 128077815, "step": 5960, "time_per_iteration": 2.685009479522705 }, { "auxiliary_loss_clip": 0.0113445, "auxiliary_loss_mlp": 0.01041748, "balance_loss_clip": 1.04900908, "balance_loss_mlp": 1.0255897, "epoch": 0.3583947091537652, "flos": 14829886702080.0, "grad_norm": 2.9088839798225035, "language_loss": 0.75860739, "learning_rate": 2.9714221760216993e-06, "loss": 0.7803694, "num_input_tokens_seen": 128095460, "step": 5961, "time_per_iteration": 2.591665506362915 }, { "auxiliary_loss_clip": 0.01103629, "auxiliary_loss_mlp": 0.01037452, "balance_loss_clip": 1.04985154, "balance_loss_mlp": 1.022223, "epoch": 0.35845483240643317, "flos": 34240644743040.0, "grad_norm": 1.7962139278871543, "language_loss": 0.70392656, "learning_rate": 2.971081721591294e-06, "loss": 0.72533739, "num_input_tokens_seen": 128118605, "step": 5962, "time_per_iteration": 2.78696346282959 }, { "auxiliary_loss_clip": 0.01116632, "auxiliary_loss_mlp": 0.01038106, "balance_loss_clip": 1.0513072, "balance_loss_mlp": 1.02532077, "epoch": 0.35851495565910113, "flos": 20960089326720.0, "grad_norm": 3.937600501619356, "language_loss": 0.75052911, "learning_rate": 2.9707412303373716e-06, "loss": 0.77207649, "num_input_tokens_seen": 128139205, "step": 5963, "time_per_iteration": 2.779210090637207 }, { "auxiliary_loss_clip": 0.01136067, "auxiliary_loss_mlp": 0.01044967, "balance_loss_clip": 1.05189323, "balance_loss_mlp": 1.03017306, "epoch": 0.35857507891176915, "flos": 22309863626880.0, "grad_norm": 3.7087256254692305, "language_loss": 0.78717148, "learning_rate": 2.9704007022728447e-06, "loss": 0.80898178, "num_input_tokens_seen": 128158765, "step": 5964, "time_per_iteration": 2.598621368408203 }, { "auxiliary_loss_clip": 0.01112011, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.05019569, "balance_loss_mlp": 1.02534723, "epoch": 0.3586352021644371, "flos": 23367863750400.0, "grad_norm": 2.0226045347569857, "language_loss": 0.66572571, "learning_rate": 2.970060137410626e-06, "loss": 0.6872592, "num_input_tokens_seen": 128177850, "step": 5965, "time_per_iteration": 2.684847116470337 }, { "auxiliary_loss_clip": 0.01132652, "auxiliary_loss_mlp": 0.0077213, "balance_loss_clip": 1.04819942, "balance_loss_mlp": 1.00052619, "epoch": 0.3586953254171051, "flos": 27849227437440.0, "grad_norm": 2.180178648475794, "language_loss": 0.79150963, "learning_rate": 2.9697195357636294e-06, "loss": 0.81055743, "num_input_tokens_seen": 128196925, "step": 5966, "time_per_iteration": 4.321925163269043 }, { "auxiliary_loss_clip": 0.01076497, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.04272628, "balance_loss_mlp": 1.02573991, "epoch": 0.35875544866977305, "flos": 19500500171520.0, "grad_norm": 2.3639555115609663, "language_loss": 0.91201752, "learning_rate": 2.9693788973447715e-06, "loss": 0.93320298, "num_input_tokens_seen": 128213955, "step": 5967, "time_per_iteration": 2.7455573081970215 }, { "auxiliary_loss_clip": 0.01101026, "auxiliary_loss_mlp": 0.01053293, "balance_loss_clip": 1.04794097, "balance_loss_mlp": 1.03494644, "epoch": 0.358815571922441, "flos": 21471134077440.0, "grad_norm": 5.4514250686274695, "language_loss": 0.80356693, "learning_rate": 2.9690382221669682e-06, "loss": 0.82511014, "num_input_tokens_seen": 128232980, "step": 5968, "time_per_iteration": 4.176758766174316 }, { "auxiliary_loss_clip": 0.01109306, "auxiliary_loss_mlp": 0.01052187, "balance_loss_clip": 1.04507756, "balance_loss_mlp": 1.03602266, "epoch": 0.358875695175109, "flos": 21835411856640.0, "grad_norm": 2.18425096992674, "language_loss": 0.8341769, "learning_rate": 2.9686975102431384e-06, "loss": 0.85579193, "num_input_tokens_seen": 128252795, "step": 5969, "time_per_iteration": 4.278231382369995 }, { "auxiliary_loss_clip": 0.01089525, "auxiliary_loss_mlp": 0.01034474, "balance_loss_clip": 1.04389262, "balance_loss_mlp": 1.0201571, "epoch": 0.35893581842777694, "flos": 32011633330560.0, "grad_norm": 2.040075228447558, "language_loss": 0.72608048, "learning_rate": 2.968356761586202e-06, "loss": 0.74732047, "num_input_tokens_seen": 128273115, "step": 5970, "time_per_iteration": 2.7784154415130615 }, { "auxiliary_loss_clip": 0.01110616, "auxiliary_loss_mlp": 0.01033542, "balance_loss_clip": 1.04673791, "balance_loss_mlp": 1.01868832, "epoch": 0.3589959416804449, "flos": 20485817124480.0, "grad_norm": 1.7975318028216438, "language_loss": 0.79562962, "learning_rate": 2.9680159762090805e-06, "loss": 0.8170712, "num_input_tokens_seen": 128292220, "step": 5971, "time_per_iteration": 4.519066333770752 }, { "auxiliary_loss_clip": 0.01098267, "auxiliary_loss_mlp": 0.01043063, "balance_loss_clip": 1.04956031, "balance_loss_mlp": 1.02766144, "epoch": 0.3590560649331129, "flos": 16180666583040.0, "grad_norm": 1.754965992567408, "language_loss": 0.78217793, "learning_rate": 2.967675154124696e-06, "loss": 0.80359125, "num_input_tokens_seen": 128310305, "step": 5972, "time_per_iteration": 2.7724227905273438 }, { "auxiliary_loss_clip": 0.01092509, "auxiliary_loss_mlp": 0.01035503, "balance_loss_clip": 1.04198921, "balance_loss_mlp": 1.02043509, "epoch": 0.35911618818578084, "flos": 20375391738240.0, "grad_norm": 2.4812117519320287, "language_loss": 0.8120966, "learning_rate": 2.9673342953459722e-06, "loss": 0.83337677, "num_input_tokens_seen": 128328305, "step": 5973, "time_per_iteration": 2.8266379833221436 }, { "auxiliary_loss_clip": 0.01042329, "auxiliary_loss_mlp": 0.01005341, "balance_loss_clip": 1.03088689, "balance_loss_mlp": 1.0036602, "epoch": 0.3591763114384488, "flos": 41236691685120.0, "grad_norm": 0.9056618080123127, "language_loss": 0.56743383, "learning_rate": 2.9669933998858355e-06, "loss": 0.58791053, "num_input_tokens_seen": 128378380, "step": 5974, "time_per_iteration": 3.0758044719696045 }, { "auxiliary_loss_clip": 0.01126274, "auxiliary_loss_mlp": 0.01037404, "balance_loss_clip": 1.04946434, "balance_loss_mlp": 1.02339661, "epoch": 0.35923643469111677, "flos": 18695454600960.0, "grad_norm": 2.5569125412900022, "language_loss": 0.68787563, "learning_rate": 2.9666524677572114e-06, "loss": 0.70951241, "num_input_tokens_seen": 128394315, "step": 5975, "time_per_iteration": 2.657576084136963 }, { "auxiliary_loss_clip": 0.01134392, "auxiliary_loss_mlp": 0.01038612, "balance_loss_clip": 1.04914975, "balance_loss_mlp": 1.02426553, "epoch": 0.35929655794378473, "flos": 25009950931200.0, "grad_norm": 1.804443520579843, "language_loss": 0.79982442, "learning_rate": 2.96631149897303e-06, "loss": 0.82155442, "num_input_tokens_seen": 128414515, "step": 5976, "time_per_iteration": 2.6197311878204346 }, { "auxiliary_loss_clip": 0.01074524, "auxiliary_loss_mlp": 0.01040105, "balance_loss_clip": 1.04337287, "balance_loss_mlp": 1.02404785, "epoch": 0.35935668119645275, "flos": 14975576265600.0, "grad_norm": 1.9714674470262432, "language_loss": 0.78818405, "learning_rate": 2.9659704935462194e-06, "loss": 0.8093304, "num_input_tokens_seen": 128430615, "step": 5977, "time_per_iteration": 2.735844612121582 }, { "auxiliary_loss_clip": 0.01094647, "auxiliary_loss_mlp": 0.01041851, "balance_loss_clip": 1.04511654, "balance_loss_mlp": 1.02789736, "epoch": 0.3594168044491207, "flos": 21178138838400.0, "grad_norm": 2.560014574379112, "language_loss": 0.79859221, "learning_rate": 2.9656294514897102e-06, "loss": 0.8199572, "num_input_tokens_seen": 128449480, "step": 5978, "time_per_iteration": 2.704134941101074 }, { "auxiliary_loss_clip": 0.01135434, "auxiliary_loss_mlp": 0.00773692, "balance_loss_clip": 1.04890609, "balance_loss_mlp": 1.00073409, "epoch": 0.3594769277017887, "flos": 27672152365440.0, "grad_norm": 4.868201977342703, "language_loss": 0.68310702, "learning_rate": 2.965288372816436e-06, "loss": 0.70219827, "num_input_tokens_seen": 128471465, "step": 5979, "time_per_iteration": 2.667222499847412 }, { "auxiliary_loss_clip": 0.01105596, "auxiliary_loss_mlp": 0.01033841, "balance_loss_clip": 1.04548645, "balance_loss_mlp": 1.01876652, "epoch": 0.35953705095445665, "flos": 23002328995200.0, "grad_norm": 6.298210491387724, "language_loss": 0.67445302, "learning_rate": 2.9649472575393296e-06, "loss": 0.69584739, "num_input_tokens_seen": 128490645, "step": 5980, "time_per_iteration": 2.6262974739074707 }, { "auxiliary_loss_clip": 0.01113802, "auxiliary_loss_mlp": 0.01040029, "balance_loss_clip": 1.04725266, "balance_loss_mlp": 1.02324414, "epoch": 0.3595971742071246, "flos": 25513992529920.0, "grad_norm": 1.8251567017824133, "language_loss": 0.71328801, "learning_rate": 2.964606105671327e-06, "loss": 0.73482633, "num_input_tokens_seen": 128510225, "step": 5981, "time_per_iteration": 2.696676254272461 }, { "auxiliary_loss_clip": 0.01109039, "auxiliary_loss_mlp": 0.01041685, "balance_loss_clip": 1.04872131, "balance_loss_mlp": 1.02498353, "epoch": 0.3596572974597926, "flos": 29862559635840.0, "grad_norm": 2.0089481436352767, "language_loss": 0.71294796, "learning_rate": 2.9642649172253635e-06, "loss": 0.73445523, "num_input_tokens_seen": 128530195, "step": 5982, "time_per_iteration": 2.7264244556427 }, { "auxiliary_loss_clip": 0.01114107, "auxiliary_loss_mlp": 0.01046667, "balance_loss_clip": 1.04542398, "balance_loss_mlp": 1.03115773, "epoch": 0.35971742071246054, "flos": 23112538899840.0, "grad_norm": 1.8520970942870048, "language_loss": 0.75614822, "learning_rate": 2.9639236922143786e-06, "loss": 0.77775598, "num_input_tokens_seen": 128549990, "step": 5983, "time_per_iteration": 2.6827449798583984 }, { "auxiliary_loss_clip": 0.01140239, "auxiliary_loss_mlp": 0.01042697, "balance_loss_clip": 1.0510025, "balance_loss_mlp": 1.02626991, "epoch": 0.3597775439651285, "flos": 16725359399040.0, "grad_norm": 17.088734777986428, "language_loss": 0.76256114, "learning_rate": 2.96358243065131e-06, "loss": 0.78439057, "num_input_tokens_seen": 128567925, "step": 5984, "time_per_iteration": 2.695389747619629 }, { "auxiliary_loss_clip": 0.01117847, "auxiliary_loss_mlp": 0.00772256, "balance_loss_clip": 1.04583967, "balance_loss_mlp": 1.00047541, "epoch": 0.3598376672177965, "flos": 19719483436800.0, "grad_norm": 1.8513392555770956, "language_loss": 0.86111921, "learning_rate": 2.9632411325490993e-06, "loss": 0.88002026, "num_input_tokens_seen": 128585655, "step": 5985, "time_per_iteration": 2.6440985202789307 }, { "auxiliary_loss_clip": 0.01117958, "auxiliary_loss_mlp": 0.01045892, "balance_loss_clip": 1.04564977, "balance_loss_mlp": 1.03012037, "epoch": 0.35989779047046444, "flos": 17311529445120.0, "grad_norm": 2.5721307867834406, "language_loss": 0.72770452, "learning_rate": 2.9628997979206884e-06, "loss": 0.74934304, "num_input_tokens_seen": 128604820, "step": 5986, "time_per_iteration": 2.6169698238372803 }, { "auxiliary_loss_clip": 0.01100506, "auxiliary_loss_mlp": 0.01039862, "balance_loss_clip": 1.04264784, "balance_loss_mlp": 1.02473474, "epoch": 0.3599579137231324, "flos": 22711237176960.0, "grad_norm": 2.1943162754876497, "language_loss": 0.73883474, "learning_rate": 2.9625584267790204e-06, "loss": 0.76023847, "num_input_tokens_seen": 128623070, "step": 5987, "time_per_iteration": 2.72385573387146 }, { "auxiliary_loss_clip": 0.0114047, "auxiliary_loss_mlp": 0.01040262, "balance_loss_clip": 1.05135727, "balance_loss_mlp": 1.02456188, "epoch": 0.36001803697580037, "flos": 20959873845120.0, "grad_norm": 2.225645474388546, "language_loss": 0.69665354, "learning_rate": 2.9622170191370404e-06, "loss": 0.71846086, "num_input_tokens_seen": 128642430, "step": 5988, "time_per_iteration": 2.6040101051330566 }, { "auxiliary_loss_clip": 0.01127132, "auxiliary_loss_mlp": 0.01043358, "balance_loss_clip": 1.04819822, "balance_loss_mlp": 1.0278132, "epoch": 0.36007816022846834, "flos": 20485565729280.0, "grad_norm": 2.281223653114012, "language_loss": 0.73300481, "learning_rate": 2.9618755750076953e-06, "loss": 0.75470972, "num_input_tokens_seen": 128661285, "step": 5989, "time_per_iteration": 2.6532981395721436 }, { "auxiliary_loss_clip": 0.01089891, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.04161119, "balance_loss_mlp": 1.02237916, "epoch": 0.36013828348113636, "flos": 28001237794560.0, "grad_norm": 3.1935134184936156, "language_loss": 0.79950285, "learning_rate": 2.961534094403931e-06, "loss": 0.82077992, "num_input_tokens_seen": 128682210, "step": 5990, "time_per_iteration": 2.785142421722412 }, { "auxiliary_loss_clip": 0.01123339, "auxiliary_loss_mlp": 0.0103344, "balance_loss_clip": 1.04714704, "balance_loss_mlp": 1.01775789, "epoch": 0.3601984067338043, "flos": 20082181017600.0, "grad_norm": 2.506195073342272, "language_loss": 0.83875644, "learning_rate": 2.961192577338698e-06, "loss": 0.86032414, "num_input_tokens_seen": 128700445, "step": 5991, "time_per_iteration": 2.6310808658599854 }, { "auxiliary_loss_clip": 0.01111044, "auxiliary_loss_mlp": 0.01045829, "balance_loss_clip": 1.04896092, "balance_loss_mlp": 1.03068912, "epoch": 0.3602585299864723, "flos": 18617599872000.0, "grad_norm": 2.314320245159203, "language_loss": 0.75628942, "learning_rate": 2.9608510238249463e-06, "loss": 0.77785814, "num_input_tokens_seen": 128716855, "step": 5992, "time_per_iteration": 2.6698272228240967 }, { "auxiliary_loss_clip": 0.01134951, "auxiliary_loss_mlp": 0.01039412, "balance_loss_clip": 1.04993188, "balance_loss_mlp": 1.02385557, "epoch": 0.36031865323914025, "flos": 19573003774080.0, "grad_norm": 2.1820524355734072, "language_loss": 0.76886415, "learning_rate": 2.960509433875627e-06, "loss": 0.79060775, "num_input_tokens_seen": 128735835, "step": 5993, "time_per_iteration": 2.5999341011047363 }, { "auxiliary_loss_clip": 0.01111748, "auxiliary_loss_mlp": 0.01054388, "balance_loss_clip": 1.04750419, "balance_loss_mlp": 1.03762674, "epoch": 0.3603787764918082, "flos": 17490615678720.0, "grad_norm": 1.8546706349055275, "language_loss": 0.74672681, "learning_rate": 2.9601678075036943e-06, "loss": 0.76838815, "num_input_tokens_seen": 128752465, "step": 5994, "time_per_iteration": 2.6691155433654785 }, { "auxiliary_loss_clip": 0.01095118, "auxiliary_loss_mlp": 0.01038312, "balance_loss_clip": 1.0480628, "balance_loss_mlp": 1.02331567, "epoch": 0.3604388997444762, "flos": 15523393564800.0, "grad_norm": 2.7696142346579666, "language_loss": 0.68887782, "learning_rate": 2.9598261447221024e-06, "loss": 0.71021217, "num_input_tokens_seen": 128770865, "step": 5995, "time_per_iteration": 2.7497267723083496 }, { "auxiliary_loss_clip": 0.01104395, "auxiliary_loss_mlp": 0.01046311, "balance_loss_clip": 1.04338932, "balance_loss_mlp": 1.03031349, "epoch": 0.36049902299714415, "flos": 17310883000320.0, "grad_norm": 2.2305093143222248, "language_loss": 0.82564914, "learning_rate": 2.9594844455438057e-06, "loss": 0.84715617, "num_input_tokens_seen": 128789730, "step": 5996, "time_per_iteration": 2.7227983474731445 }, { "auxiliary_loss_clip": 0.01135369, "auxiliary_loss_mlp": 0.0103828, "balance_loss_clip": 1.04974842, "balance_loss_mlp": 1.02300954, "epoch": 0.3605591462498121, "flos": 17056025026560.0, "grad_norm": 2.068995609090248, "language_loss": 0.73795009, "learning_rate": 2.959142709981763e-06, "loss": 0.75968659, "num_input_tokens_seen": 128806610, "step": 5997, "time_per_iteration": 2.572842836380005 }, { "auxiliary_loss_clip": 0.01121916, "auxiliary_loss_mlp": 0.01036628, "balance_loss_clip": 1.0482775, "balance_loss_mlp": 1.0226686, "epoch": 0.3606192695024801, "flos": 16836862193280.0, "grad_norm": 2.7116535757300215, "language_loss": 0.69209671, "learning_rate": 2.9588009380489337e-06, "loss": 0.71368217, "num_input_tokens_seen": 128824830, "step": 5998, "time_per_iteration": 2.604459047317505 }, { "auxiliary_loss_clip": 0.01085406, "auxiliary_loss_mlp": 0.01041904, "balance_loss_clip": 1.04395008, "balance_loss_mlp": 1.02565587, "epoch": 0.36067939275514804, "flos": 12129655743360.0, "grad_norm": 2.6293691676304745, "language_loss": 0.76580822, "learning_rate": 2.9584591297582758e-06, "loss": 0.78708136, "num_input_tokens_seen": 128838170, "step": 5999, "time_per_iteration": 2.6671667098999023 }, { "auxiliary_loss_clip": 0.01098137, "auxiliary_loss_mlp": 0.01040783, "balance_loss_clip": 1.04674315, "balance_loss_mlp": 1.02590609, "epoch": 0.360739516007816, "flos": 18041449720320.0, "grad_norm": 1.8157116334206203, "language_loss": 0.78264523, "learning_rate": 2.9581172851227516e-06, "loss": 0.80403441, "num_input_tokens_seen": 128855625, "step": 6000, "time_per_iteration": 2.743117332458496 }, { "auxiliary_loss_clip": 0.01095162, "auxiliary_loss_mlp": 0.01036289, "balance_loss_clip": 1.04705954, "balance_loss_mlp": 1.02203155, "epoch": 0.360799639260484, "flos": 18549800951040.0, "grad_norm": 1.8701006971713747, "language_loss": 0.78316295, "learning_rate": 2.9577754041553243e-06, "loss": 0.80447751, "num_input_tokens_seen": 128873540, "step": 6001, "time_per_iteration": 2.7342417240142822 }, { "auxiliary_loss_clip": 0.01130356, "auxiliary_loss_mlp": 0.0077146, "balance_loss_clip": 1.04727733, "balance_loss_mlp": 1.00072694, "epoch": 0.36085976251315194, "flos": 19682028529920.0, "grad_norm": 3.3927220139250056, "language_loss": 0.83151853, "learning_rate": 2.9574334868689575e-06, "loss": 0.8505367, "num_input_tokens_seen": 128889925, "step": 6002, "time_per_iteration": 2.6884238719940186 }, { "auxiliary_loss_clip": 0.01101804, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.04249346, "balance_loss_mlp": 1.02011156, "epoch": 0.3609198857658199, "flos": 24198943703040.0, "grad_norm": 2.135208430409031, "language_loss": 0.90677911, "learning_rate": 2.9570915332766165e-06, "loss": 0.92812997, "num_input_tokens_seen": 128906890, "step": 6003, "time_per_iteration": 2.666738986968994 }, { "auxiliary_loss_clip": 0.01036783, "auxiliary_loss_mlp": 0.0101378, "balance_loss_clip": 1.03707922, "balance_loss_mlp": 1.01194429, "epoch": 0.3609800090184879, "flos": 57115995160320.0, "grad_norm": 0.8844533830179444, "language_loss": 0.53396428, "learning_rate": 2.9567495433912693e-06, "loss": 0.55446988, "num_input_tokens_seen": 128965940, "step": 6004, "time_per_iteration": 3.1421444416046143 }, { "auxiliary_loss_clip": 0.01112391, "auxiliary_loss_mlp": 0.00772771, "balance_loss_clip": 1.04665363, "balance_loss_mlp": 1.00050342, "epoch": 0.3610401322711559, "flos": 20811239366400.0, "grad_norm": 2.085214899207264, "language_loss": 0.77743608, "learning_rate": 2.956407517225883e-06, "loss": 0.79628766, "num_input_tokens_seen": 128985835, "step": 6005, "time_per_iteration": 4.196998596191406 }, { "auxiliary_loss_clip": 0.01114373, "auxiliary_loss_mlp": 0.01043264, "balance_loss_clip": 1.04545391, "balance_loss_mlp": 1.02866125, "epoch": 0.36110025552382385, "flos": 13699167494400.0, "grad_norm": 1.984756598411705, "language_loss": 0.78795588, "learning_rate": 2.956065454793429e-06, "loss": 0.80953228, "num_input_tokens_seen": 129003120, "step": 6006, "time_per_iteration": 2.642446517944336 }, { "auxiliary_loss_clip": 0.01135515, "auxiliary_loss_mlp": 0.01037404, "balance_loss_clip": 1.04913247, "balance_loss_mlp": 1.02116823, "epoch": 0.3611603787764918, "flos": 22455014486400.0, "grad_norm": 3.6522767524231248, "language_loss": 0.84766537, "learning_rate": 2.955723356106876e-06, "loss": 0.86939454, "num_input_tokens_seen": 129021645, "step": 6007, "time_per_iteration": 4.38408637046814 }, { "auxiliary_loss_clip": 0.01120706, "auxiliary_loss_mlp": 0.01035853, "balance_loss_clip": 1.05059266, "balance_loss_mlp": 1.01940203, "epoch": 0.3612205020291598, "flos": 20886651970560.0, "grad_norm": 2.20663208121776, "language_loss": 0.72179425, "learning_rate": 2.955381221179198e-06, "loss": 0.7433598, "num_input_tokens_seen": 129038375, "step": 6008, "time_per_iteration": 4.262283802032471 }, { "auxiliary_loss_clip": 0.01118211, "auxiliary_loss_mlp": 0.0103587, "balance_loss_clip": 1.04345882, "balance_loss_mlp": 1.02150559, "epoch": 0.36128062528182775, "flos": 15741981780480.0, "grad_norm": 7.815944525258205, "language_loss": 0.83056295, "learning_rate": 2.955039050023368e-06, "loss": 0.85210377, "num_input_tokens_seen": 129056235, "step": 6009, "time_per_iteration": 2.643824577331543 }, { "auxiliary_loss_clip": 0.01105662, "auxiliary_loss_mlp": 0.01045676, "balance_loss_clip": 1.04862237, "balance_loss_mlp": 1.03013086, "epoch": 0.3613407485344957, "flos": 16764502245120.0, "grad_norm": 2.1132167438001166, "language_loss": 0.7616573, "learning_rate": 2.954696842652362e-06, "loss": 0.7831707, "num_input_tokens_seen": 129072405, "step": 6010, "time_per_iteration": 4.361377000808716 }, { "auxiliary_loss_clip": 0.01104786, "auxiliary_loss_mlp": 0.01035576, "balance_loss_clip": 1.04665053, "balance_loss_mlp": 1.02091312, "epoch": 0.3614008717871637, "flos": 20371189847040.0, "grad_norm": 1.759609272436165, "language_loss": 0.83214396, "learning_rate": 2.9543545990791554e-06, "loss": 0.85354757, "num_input_tokens_seen": 129090225, "step": 6011, "time_per_iteration": 2.679145574569702 }, { "auxiliary_loss_clip": 0.01141696, "auxiliary_loss_mlp": 0.01041601, "balance_loss_clip": 1.05070031, "balance_loss_mlp": 1.02562666, "epoch": 0.36146099503983165, "flos": 22776665800320.0, "grad_norm": 2.194420173883677, "language_loss": 0.62446111, "learning_rate": 2.954012319316727e-06, "loss": 0.64629406, "num_input_tokens_seen": 129107685, "step": 6012, "time_per_iteration": 2.6012516021728516 }, { "auxiliary_loss_clip": 0.01106556, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.04518831, "balance_loss_mlp": 1.02368951, "epoch": 0.3615211182924996, "flos": 22996654646400.0, "grad_norm": 1.831524666449312, "language_loss": 0.8381623, "learning_rate": 2.9536700033780565e-06, "loss": 0.85961026, "num_input_tokens_seen": 129125315, "step": 6013, "time_per_iteration": 2.7191901206970215 }, { "auxiliary_loss_clip": 0.01131608, "auxiliary_loss_mlp": 0.01040321, "balance_loss_clip": 1.04590511, "balance_loss_mlp": 1.02466893, "epoch": 0.3615812415451676, "flos": 16648079287680.0, "grad_norm": 3.6755742539930285, "language_loss": 0.91541535, "learning_rate": 2.9533276512761228e-06, "loss": 0.93713462, "num_input_tokens_seen": 129141600, "step": 6014, "time_per_iteration": 2.714121103286743 }, { "auxiliary_loss_clip": 0.01131507, "auxiliary_loss_mlp": 0.01042414, "balance_loss_clip": 1.0463829, "balance_loss_mlp": 1.0268693, "epoch": 0.36164136479783554, "flos": 21320093387520.0, "grad_norm": 2.2181121985150094, "language_loss": 0.73578274, "learning_rate": 2.95298526302391e-06, "loss": 0.75752199, "num_input_tokens_seen": 129160665, "step": 6015, "time_per_iteration": 2.668600082397461 }, { "auxiliary_loss_clip": 0.0105036, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.03610015, "balance_loss_mlp": 1.02980912, "epoch": 0.3617014880505035, "flos": 24169569356160.0, "grad_norm": 2.2662955263586158, "language_loss": 0.64756966, "learning_rate": 2.9526428386344e-06, "loss": 0.66855025, "num_input_tokens_seen": 129179220, "step": 6016, "time_per_iteration": 2.8753597736358643 }, { "auxiliary_loss_clip": 0.01127577, "auxiliary_loss_mlp": 0.01039172, "balance_loss_clip": 1.05000329, "balance_loss_mlp": 1.02170801, "epoch": 0.3617616113031715, "flos": 39014824101120.0, "grad_norm": 2.0483319793753343, "language_loss": 0.71927178, "learning_rate": 2.9523003781205785e-06, "loss": 0.74093938, "num_input_tokens_seen": 129200385, "step": 6017, "time_per_iteration": 2.8195903301239014 }, { "auxiliary_loss_clip": 0.01123165, "auxiliary_loss_mlp": 0.01043013, "balance_loss_clip": 1.04506993, "balance_loss_mlp": 1.02724147, "epoch": 0.3618217345558395, "flos": 12130840892160.0, "grad_norm": 2.196881428409859, "language_loss": 0.73543239, "learning_rate": 2.9519578814954307e-06, "loss": 0.7570942, "num_input_tokens_seen": 129217395, "step": 6018, "time_per_iteration": 2.6454639434814453 }, { "auxiliary_loss_clip": 0.01088616, "auxiliary_loss_mlp": 0.01036025, "balance_loss_clip": 1.0470562, "balance_loss_mlp": 1.02079058, "epoch": 0.36188185780850746, "flos": 24935005203840.0, "grad_norm": 2.8373114264415222, "language_loss": 0.69157374, "learning_rate": 2.9516153487719448e-06, "loss": 0.71282017, "num_input_tokens_seen": 129238940, "step": 6019, "time_per_iteration": 2.824361801147461 }, { "auxiliary_loss_clip": 0.0111438, "auxiliary_loss_mlp": 0.0103897, "balance_loss_clip": 1.04542887, "balance_loss_mlp": 1.02275765, "epoch": 0.3619419810611754, "flos": 20958832350720.0, "grad_norm": 3.405770043894724, "language_loss": 0.76428473, "learning_rate": 2.95127277996311e-06, "loss": 0.78581828, "num_input_tokens_seen": 129258240, "step": 6020, "time_per_iteration": 2.6757993698120117 }, { "auxiliary_loss_clip": 0.01124662, "auxiliary_loss_mlp": 0.01041506, "balance_loss_clip": 1.04899478, "balance_loss_mlp": 1.02512705, "epoch": 0.3620021043138434, "flos": 22528882805760.0, "grad_norm": 2.1413312386751606, "language_loss": 0.73802006, "learning_rate": 2.9509301750819156e-06, "loss": 0.7596817, "num_input_tokens_seen": 129279040, "step": 6021, "time_per_iteration": 2.6422386169433594 }, { "auxiliary_loss_clip": 0.01094575, "auxiliary_loss_mlp": 0.01036086, "balance_loss_clip": 1.04502845, "balance_loss_mlp": 1.02170944, "epoch": 0.36206222756651135, "flos": 15596687266560.0, "grad_norm": 8.65046906858069, "language_loss": 0.80683851, "learning_rate": 2.9505875341413533e-06, "loss": 0.82814515, "num_input_tokens_seen": 129295415, "step": 6022, "time_per_iteration": 2.7069809436798096 }, { "auxiliary_loss_clip": 0.0112144, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.04967427, "balance_loss_mlp": 1.02036762, "epoch": 0.3621223508191793, "flos": 23587170238080.0, "grad_norm": 1.6359940708258738, "language_loss": 0.81630391, "learning_rate": 2.950244857154417e-06, "loss": 0.83786309, "num_input_tokens_seen": 129312620, "step": 6023, "time_per_iteration": 2.676196575164795 }, { "auxiliary_loss_clip": 0.01115391, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.04994166, "balance_loss_mlp": 1.02266037, "epoch": 0.3621824740718473, "flos": 22309899540480.0, "grad_norm": 2.238629896510925, "language_loss": 0.79401833, "learning_rate": 2.9499021441341e-06, "loss": 0.81555158, "num_input_tokens_seen": 129331825, "step": 6024, "time_per_iteration": 2.6479294300079346 }, { "auxiliary_loss_clip": 0.01098352, "auxiliary_loss_mlp": 0.01041698, "balance_loss_clip": 1.04168642, "balance_loss_mlp": 1.02567625, "epoch": 0.36224259732451525, "flos": 16763640318720.0, "grad_norm": 2.1016508822119517, "language_loss": 0.74409318, "learning_rate": 2.9495593950933997e-06, "loss": 0.76549369, "num_input_tokens_seen": 129350400, "step": 6025, "time_per_iteration": 2.720113515853882 }, { "auxiliary_loss_clip": 0.01121634, "auxiliary_loss_mlp": 0.00772492, "balance_loss_clip": 1.04758501, "balance_loss_mlp": 1.00045466, "epoch": 0.3623027205771832, "flos": 23149742411520.0, "grad_norm": 1.7192758683210898, "language_loss": 0.72363192, "learning_rate": 2.9492166100453107e-06, "loss": 0.74257314, "num_input_tokens_seen": 129371155, "step": 6026, "time_per_iteration": 2.647515296936035 }, { "auxiliary_loss_clip": 0.01130763, "auxiliary_loss_mlp": 0.01045791, "balance_loss_clip": 1.05090141, "balance_loss_mlp": 1.0300554, "epoch": 0.3623628438298512, "flos": 28549162834560.0, "grad_norm": 3.1509295844270166, "language_loss": 0.79584157, "learning_rate": 2.948873789002833e-06, "loss": 0.81760705, "num_input_tokens_seen": 129391230, "step": 6027, "time_per_iteration": 2.666778802871704 }, { "auxiliary_loss_clip": 0.01112806, "auxiliary_loss_mlp": 0.01044567, "balance_loss_clip": 1.04690945, "balance_loss_mlp": 1.02730584, "epoch": 0.36242296708251914, "flos": 25484941405440.0, "grad_norm": 2.036912075012155, "language_loss": 0.67857373, "learning_rate": 2.9485309319789667e-06, "loss": 0.70014751, "num_input_tokens_seen": 129410065, "step": 6028, "time_per_iteration": 2.721635103225708 }, { "auxiliary_loss_clip": 0.01093428, "auxiliary_loss_mlp": 0.01039806, "balance_loss_clip": 1.04534137, "balance_loss_mlp": 1.02493429, "epoch": 0.3624830903351871, "flos": 16290373697280.0, "grad_norm": 2.040296243102333, "language_loss": 0.85588348, "learning_rate": 2.9481880389867117e-06, "loss": 0.87721586, "num_input_tokens_seen": 129428655, "step": 6029, "time_per_iteration": 2.768638849258423 }, { "auxiliary_loss_clip": 0.01097178, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.04583371, "balance_loss_mlp": 1.02534389, "epoch": 0.36254321358785513, "flos": 18296307694080.0, "grad_norm": 1.826841085229912, "language_loss": 0.72638077, "learning_rate": 2.9478451100390714e-06, "loss": 0.74775726, "num_input_tokens_seen": 129447845, "step": 6030, "time_per_iteration": 2.6222145557403564 }, { "auxiliary_loss_clip": 0.01111443, "auxiliary_loss_mlp": 0.0104401, "balance_loss_clip": 1.0471518, "balance_loss_mlp": 1.02635479, "epoch": 0.3626033368405231, "flos": 14865294533760.0, "grad_norm": 2.682823168265615, "language_loss": 0.74219912, "learning_rate": 2.94750214514905e-06, "loss": 0.76375365, "num_input_tokens_seen": 129463275, "step": 6031, "time_per_iteration": 2.62003493309021 }, { "auxiliary_loss_clip": 0.01090216, "auxiliary_loss_mlp": 0.01046109, "balance_loss_clip": 1.04174352, "balance_loss_mlp": 1.03031349, "epoch": 0.36266346009319106, "flos": 22306595489280.0, "grad_norm": 2.122404426395552, "language_loss": 0.72930032, "learning_rate": 2.9471591443296516e-06, "loss": 0.75066358, "num_input_tokens_seen": 129483205, "step": 6032, "time_per_iteration": 2.7382266521453857 }, { "auxiliary_loss_clip": 0.01089342, "auxiliary_loss_mlp": 0.0104871, "balance_loss_clip": 1.0457828, "balance_loss_mlp": 1.03320134, "epoch": 0.362723583345859, "flos": 18222331633920.0, "grad_norm": 2.0052695882675895, "language_loss": 0.77577424, "learning_rate": 2.946816107593884e-06, "loss": 0.79715478, "num_input_tokens_seen": 129499885, "step": 6033, "time_per_iteration": 2.712574005126953 }, { "auxiliary_loss_clip": 0.01011518, "auxiliary_loss_mlp": 0.01010455, "balance_loss_clip": 1.02346182, "balance_loss_mlp": 1.00881004, "epoch": 0.362783706598527, "flos": 68499174458880.0, "grad_norm": 0.775881514372135, "language_loss": 0.6472615, "learning_rate": 2.9464730349547547e-06, "loss": 0.66748118, "num_input_tokens_seen": 129561885, "step": 6034, "time_per_iteration": 3.33389949798584 }, { "auxiliary_loss_clip": 0.0111586, "auxiliary_loss_mlp": 0.01039589, "balance_loss_clip": 1.04362679, "balance_loss_mlp": 1.02373409, "epoch": 0.36284382985119495, "flos": 26576589594240.0, "grad_norm": 2.348469757016237, "language_loss": 0.89869213, "learning_rate": 2.946129926425273e-06, "loss": 0.9202466, "num_input_tokens_seen": 129582325, "step": 6035, "time_per_iteration": 2.661137580871582 }, { "auxiliary_loss_clip": 0.01112128, "auxiliary_loss_mlp": 0.01040682, "balance_loss_clip": 1.04810882, "balance_loss_mlp": 1.02445734, "epoch": 0.3629039531038629, "flos": 20156767608960.0, "grad_norm": 1.7965494412259506, "language_loss": 0.73480749, "learning_rate": 2.9457867820184496e-06, "loss": 0.75633562, "num_input_tokens_seen": 129600350, "step": 6036, "time_per_iteration": 2.627746105194092 }, { "auxiliary_loss_clip": 0.01118939, "auxiliary_loss_mlp": 0.01034203, "balance_loss_clip": 1.0476563, "balance_loss_mlp": 1.01825309, "epoch": 0.3629640763565309, "flos": 18625716345600.0, "grad_norm": 2.247638401714898, "language_loss": 0.75895989, "learning_rate": 2.945443601747297e-06, "loss": 0.78049135, "num_input_tokens_seen": 129618425, "step": 6037, "time_per_iteration": 2.6763134002685547 }, { "auxiliary_loss_clip": 0.01117432, "auxiliary_loss_mlp": 0.0105958, "balance_loss_clip": 1.04722893, "balance_loss_mlp": 1.04149556, "epoch": 0.36302419960919885, "flos": 19571459489280.0, "grad_norm": 1.7641921793444904, "language_loss": 0.78425813, "learning_rate": 2.945100385624828e-06, "loss": 0.80602825, "num_input_tokens_seen": 129636750, "step": 6038, "time_per_iteration": 2.6576154232025146 }, { "auxiliary_loss_clip": 0.01042272, "auxiliary_loss_mlp": 0.01000075, "balance_loss_clip": 1.02576721, "balance_loss_mlp": 0.99842948, "epoch": 0.3630843228618668, "flos": 63797606444160.0, "grad_norm": 0.8328343708327894, "language_loss": 0.63371962, "learning_rate": 2.9447571336640573e-06, "loss": 0.6541431, "num_input_tokens_seen": 129699030, "step": 6039, "time_per_iteration": 3.268035650253296 }, { "auxiliary_loss_clip": 0.01108663, "auxiliary_loss_mlp": 0.01052032, "balance_loss_clip": 1.04687905, "balance_loss_mlp": 1.03485394, "epoch": 0.3631444461145348, "flos": 21835160461440.0, "grad_norm": 2.83972356132426, "language_loss": 0.71349055, "learning_rate": 2.944413845878002e-06, "loss": 0.73509747, "num_input_tokens_seen": 129717135, "step": 6040, "time_per_iteration": 2.7468066215515137 }, { "auxiliary_loss_clip": 0.01129452, "auxiliary_loss_mlp": 0.01039721, "balance_loss_clip": 1.05027485, "balance_loss_mlp": 1.02372289, "epoch": 0.36320456936720275, "flos": 21722041555200.0, "grad_norm": 1.6017927687359714, "language_loss": 0.81615877, "learning_rate": 2.9440705222796783e-06, "loss": 0.83785057, "num_input_tokens_seen": 129735940, "step": 6041, "time_per_iteration": 2.6624767780303955 }, { "auxiliary_loss_clip": 0.01116373, "auxiliary_loss_mlp": 0.01037475, "balance_loss_clip": 1.04789138, "balance_loss_mlp": 1.02039289, "epoch": 0.3632646926198707, "flos": 17019072910080.0, "grad_norm": 6.335898198250863, "language_loss": 0.83848882, "learning_rate": 2.943727162882107e-06, "loss": 0.86002731, "num_input_tokens_seen": 129752790, "step": 6042, "time_per_iteration": 2.6279616355895996 }, { "auxiliary_loss_clip": 0.01113831, "auxiliary_loss_mlp": 0.01045895, "balance_loss_clip": 1.04817295, "balance_loss_mlp": 1.03020668, "epoch": 0.36332481587253873, "flos": 23331163029120.0, "grad_norm": 1.8194124872693949, "language_loss": 0.78401059, "learning_rate": 2.9433837676983064e-06, "loss": 0.80560786, "num_input_tokens_seen": 129773655, "step": 6043, "time_per_iteration": 4.221862077713013 }, { "auxiliary_loss_clip": 0.01111193, "auxiliary_loss_mlp": 0.01036813, "balance_loss_clip": 1.05454051, "balance_loss_mlp": 1.02078581, "epoch": 0.3633849391252067, "flos": 10743539857920.0, "grad_norm": 2.743973887678544, "language_loss": 0.65664518, "learning_rate": 2.943040336741298e-06, "loss": 0.67812526, "num_input_tokens_seen": 129791605, "step": 6044, "time_per_iteration": 2.7301173210144043 }, { "auxiliary_loss_clip": 0.01109397, "auxiliary_loss_mlp": 0.01034976, "balance_loss_clip": 1.04838157, "balance_loss_mlp": 1.02035475, "epoch": 0.36344506237787466, "flos": 25849147357440.0, "grad_norm": 2.5365479968338187, "language_loss": 0.81149542, "learning_rate": 2.9426968700241066e-06, "loss": 0.83293915, "num_input_tokens_seen": 129811075, "step": 6045, "time_per_iteration": 2.6896753311157227 }, { "auxiliary_loss_clip": 0.0110304, "auxiliary_loss_mlp": 0.01045503, "balance_loss_clip": 1.04706383, "balance_loss_mlp": 1.02923083, "epoch": 0.3635051856305426, "flos": 30154046503680.0, "grad_norm": 2.400629400498793, "language_loss": 0.65010375, "learning_rate": 2.942353367559755e-06, "loss": 0.67158914, "num_input_tokens_seen": 129833755, "step": 6046, "time_per_iteration": 2.800321578979492 }, { "auxiliary_loss_clip": 0.01102544, "auxiliary_loss_mlp": 0.01038937, "balance_loss_clip": 1.0467155, "balance_loss_mlp": 1.02399993, "epoch": 0.3635653088832106, "flos": 22198396746240.0, "grad_norm": 2.172977049503826, "language_loss": 0.77142686, "learning_rate": 2.9420098293612692e-06, "loss": 0.79284167, "num_input_tokens_seen": 129854475, "step": 6047, "time_per_iteration": 4.274283170700073 }, { "auxiliary_loss_clip": 0.01137356, "auxiliary_loss_mlp": 0.01047564, "balance_loss_clip": 1.05142486, "balance_loss_mlp": 1.02983761, "epoch": 0.36362543213587856, "flos": 24787053083520.0, "grad_norm": 1.922622021112015, "language_loss": 0.79610157, "learning_rate": 2.9416662554416767e-06, "loss": 0.81795079, "num_input_tokens_seen": 129873530, "step": 6048, "time_per_iteration": 4.283480644226074 }, { "auxiliary_loss_clip": 0.01037942, "auxiliary_loss_mlp": 0.01005664, "balance_loss_clip": 1.01860034, "balance_loss_mlp": 1.00387573, "epoch": 0.3636855553885465, "flos": 62526369231360.0, "grad_norm": 0.749844121463454, "language_loss": 0.52550006, "learning_rate": 2.9413226458140054e-06, "loss": 0.54593611, "num_input_tokens_seen": 129940400, "step": 6049, "time_per_iteration": 3.2647299766540527 }, { "auxiliary_loss_clip": 0.01105759, "auxiliary_loss_mlp": 0.01042028, "balance_loss_clip": 1.04831481, "balance_loss_mlp": 1.02467084, "epoch": 0.3637456786412145, "flos": 24060652341120.0, "grad_norm": 9.722138117523357, "language_loss": 0.8628068, "learning_rate": 2.9409790004912845e-06, "loss": 0.88428462, "num_input_tokens_seen": 129958635, "step": 6050, "time_per_iteration": 2.744236469268799 }, { "auxiliary_loss_clip": 0.01120328, "auxiliary_loss_mlp": 0.00772785, "balance_loss_clip": 1.04944158, "balance_loss_mlp": 1.0004611, "epoch": 0.36380580189388245, "flos": 16691495852160.0, "grad_norm": 3.109361789309709, "language_loss": 0.78116536, "learning_rate": 2.940635319486546e-06, "loss": 0.80009651, "num_input_tokens_seen": 129977685, "step": 6051, "time_per_iteration": 2.6305320262908936 }, { "auxiliary_loss_clip": 0.01127196, "auxiliary_loss_mlp": 0.01040856, "balance_loss_clip": 1.04900503, "balance_loss_mlp": 1.02559745, "epoch": 0.3638659251465504, "flos": 25114091437440.0, "grad_norm": 1.9275322741448784, "language_loss": 0.82526582, "learning_rate": 2.940291602812822e-06, "loss": 0.84694636, "num_input_tokens_seen": 129997530, "step": 6052, "time_per_iteration": 2.711794853210449 }, { "auxiliary_loss_clip": 0.01100415, "auxiliary_loss_mlp": 0.01036967, "balance_loss_clip": 1.04675376, "balance_loss_mlp": 1.02270949, "epoch": 0.3639260483992184, "flos": 23003011353600.0, "grad_norm": 1.7820298413079305, "language_loss": 0.72085792, "learning_rate": 2.939947850483145e-06, "loss": 0.74223173, "num_input_tokens_seen": 130017955, "step": 6053, "time_per_iteration": 2.725600481033325 }, { "auxiliary_loss_clip": 0.01015406, "auxiliary_loss_mlp": 0.01003631, "balance_loss_clip": 1.0300014, "balance_loss_mlp": 1.00155663, "epoch": 0.36398617165188635, "flos": 70716011160960.0, "grad_norm": 0.7712310074836012, "language_loss": 0.61214095, "learning_rate": 2.9396040625105532e-06, "loss": 0.63233131, "num_input_tokens_seen": 130074275, "step": 6054, "time_per_iteration": 3.3252007961273193 }, { "auxiliary_loss_clip": 0.0111079, "auxiliary_loss_mlp": 0.01038999, "balance_loss_clip": 1.04735899, "balance_loss_mlp": 1.02214301, "epoch": 0.3640462949045543, "flos": 22235456603520.0, "grad_norm": 2.93078334140581, "language_loss": 0.75820959, "learning_rate": 2.9392602389080802e-06, "loss": 0.77970749, "num_input_tokens_seen": 130091375, "step": 6055, "time_per_iteration": 2.656001091003418 }, { "auxiliary_loss_clip": 0.0113529, "auxiliary_loss_mlp": 0.01041525, "balance_loss_clip": 1.04910016, "balance_loss_mlp": 1.02581286, "epoch": 0.3641064181572223, "flos": 21543529939200.0, "grad_norm": 1.6734377169093124, "language_loss": 0.7533145, "learning_rate": 2.938916379688765e-06, "loss": 0.77508265, "num_input_tokens_seen": 130111595, "step": 6056, "time_per_iteration": 2.654418468475342 }, { "auxiliary_loss_clip": 0.01121707, "auxiliary_loss_mlp": 0.01038714, "balance_loss_clip": 1.055071, "balance_loss_mlp": 1.02337217, "epoch": 0.3641665414098903, "flos": 22273306560000.0, "grad_norm": 2.035168503846255, "language_loss": 0.80473512, "learning_rate": 2.9385724848656468e-06, "loss": 0.82633936, "num_input_tokens_seen": 130131440, "step": 6057, "time_per_iteration": 2.7347753047943115 }, { "auxiliary_loss_clip": 0.01107128, "auxiliary_loss_mlp": 0.01039802, "balance_loss_clip": 1.04495037, "balance_loss_mlp": 1.02438855, "epoch": 0.36422666466255826, "flos": 28329676778880.0, "grad_norm": 2.043030499006847, "language_loss": 0.80264485, "learning_rate": 2.9382285544517647e-06, "loss": 0.8241142, "num_input_tokens_seen": 130151375, "step": 6058, "time_per_iteration": 2.695674180984497 }, { "auxiliary_loss_clip": 0.01102831, "auxiliary_loss_mlp": 0.00772601, "balance_loss_clip": 1.04357934, "balance_loss_mlp": 1.00046432, "epoch": 0.36428678791522623, "flos": 24170503109760.0, "grad_norm": 2.032310914115462, "language_loss": 0.84994543, "learning_rate": 2.9378845884601636e-06, "loss": 0.86869979, "num_input_tokens_seen": 130169960, "step": 6059, "time_per_iteration": 2.6912410259246826 }, { "auxiliary_loss_clip": 0.01093721, "auxiliary_loss_mlp": 0.01039242, "balance_loss_clip": 1.04318213, "balance_loss_mlp": 1.02287483, "epoch": 0.3643469111678942, "flos": 22528451842560.0, "grad_norm": 5.903326132338396, "language_loss": 0.87806225, "learning_rate": 2.937540586903884e-06, "loss": 0.89939183, "num_input_tokens_seen": 130189800, "step": 6060, "time_per_iteration": 2.713115692138672 }, { "auxiliary_loss_clip": 0.01125791, "auxiliary_loss_mlp": 0.01040312, "balance_loss_clip": 1.0498302, "balance_loss_mlp": 1.02388453, "epoch": 0.36440703442056216, "flos": 19426595938560.0, "grad_norm": 2.3521788015610805, "language_loss": 0.66954017, "learning_rate": 2.937196549795971e-06, "loss": 0.69120121, "num_input_tokens_seen": 130206370, "step": 6061, "time_per_iteration": 2.8435866832733154 }, { "auxiliary_loss_clip": 0.0111942, "auxiliary_loss_mlp": 0.01038694, "balance_loss_clip": 1.05207086, "balance_loss_mlp": 1.02260041, "epoch": 0.3644671576732301, "flos": 18040515966720.0, "grad_norm": 2.5119296796020354, "language_loss": 0.75012159, "learning_rate": 2.9368524771494718e-06, "loss": 0.77170277, "num_input_tokens_seen": 130224445, "step": 6062, "time_per_iteration": 2.659853935241699 }, { "auxiliary_loss_clip": 0.01108402, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.04851866, "balance_loss_mlp": 1.01628149, "epoch": 0.3645272809258981, "flos": 21542811667200.0, "grad_norm": 2.568706719167558, "language_loss": 0.72070628, "learning_rate": 2.936508368977432e-06, "loss": 0.74213189, "num_input_tokens_seen": 130245380, "step": 6063, "time_per_iteration": 2.7098159790039062 }, { "auxiliary_loss_clip": 0.01118768, "auxiliary_loss_mlp": 0.010373, "balance_loss_clip": 1.04472148, "balance_loss_mlp": 1.02187479, "epoch": 0.36458740417856605, "flos": 22746860490240.0, "grad_norm": 2.3511982692020936, "language_loss": 0.68179435, "learning_rate": 2.936164225292901e-06, "loss": 0.70335501, "num_input_tokens_seen": 130265575, "step": 6064, "time_per_iteration": 2.6513044834136963 }, { "auxiliary_loss_clip": 0.01116627, "auxiliary_loss_mlp": 0.01045789, "balance_loss_clip": 1.04925466, "balance_loss_mlp": 1.02988076, "epoch": 0.364647527431234, "flos": 26140670138880.0, "grad_norm": 1.9840367281230236, "language_loss": 0.74147421, "learning_rate": 2.9358200461089297e-06, "loss": 0.76309836, "num_input_tokens_seen": 130286195, "step": 6065, "time_per_iteration": 2.764556407928467 }, { "auxiliary_loss_clip": 0.0111688, "auxiliary_loss_mlp": 0.01040465, "balance_loss_clip": 1.04924774, "balance_loss_mlp": 1.02306008, "epoch": 0.364707650683902, "flos": 31029907737600.0, "grad_norm": 2.0108238901766042, "language_loss": 0.75444913, "learning_rate": 2.9354758314385676e-06, "loss": 0.77602255, "num_input_tokens_seen": 130306095, "step": 6066, "time_per_iteration": 2.749293088912964 }, { "auxiliary_loss_clip": 0.01121102, "auxiliary_loss_mlp": 0.01034674, "balance_loss_clip": 1.04859555, "balance_loss_mlp": 1.02010643, "epoch": 0.36476777393656995, "flos": 19572896033280.0, "grad_norm": 2.8385875288429587, "language_loss": 0.76480901, "learning_rate": 2.9351315812948684e-06, "loss": 0.78636676, "num_input_tokens_seen": 130324685, "step": 6067, "time_per_iteration": 2.619833469390869 }, { "auxiliary_loss_clip": 0.01135088, "auxiliary_loss_mlp": 0.0103807, "balance_loss_clip": 1.05067635, "balance_loss_mlp": 1.02401567, "epoch": 0.3648278971892379, "flos": 17748849530880.0, "grad_norm": 2.2214902441228563, "language_loss": 0.71036232, "learning_rate": 2.934787295690886e-06, "loss": 0.73209393, "num_input_tokens_seen": 130343855, "step": 6068, "time_per_iteration": 2.633678674697876 }, { "auxiliary_loss_clip": 0.01119276, "auxiliary_loss_mlp": 0.01039471, "balance_loss_clip": 1.0432384, "balance_loss_mlp": 1.02402711, "epoch": 0.3648880204419059, "flos": 17931167988480.0, "grad_norm": 2.184109901605664, "language_loss": 0.74421692, "learning_rate": 2.9344429746396755e-06, "loss": 0.76580441, "num_input_tokens_seen": 130362320, "step": 6069, "time_per_iteration": 2.6463425159454346 }, { "auxiliary_loss_clip": 0.01115147, "auxiliary_loss_mlp": 0.0103807, "balance_loss_clip": 1.04814148, "balance_loss_mlp": 1.02237022, "epoch": 0.3649481436945739, "flos": 22638266697600.0, "grad_norm": 1.8874088651190308, "language_loss": 0.66247845, "learning_rate": 2.9340986181542945e-06, "loss": 0.68401062, "num_input_tokens_seen": 130383165, "step": 6070, "time_per_iteration": 2.70835280418396 }, { "auxiliary_loss_clip": 0.01118852, "auxiliary_loss_mlp": 0.01036547, "balance_loss_clip": 1.04837227, "balance_loss_mlp": 1.02161574, "epoch": 0.36500826694724187, "flos": 21579656042880.0, "grad_norm": 1.882521473859371, "language_loss": 0.74406028, "learning_rate": 2.9337542262477994e-06, "loss": 0.76561427, "num_input_tokens_seen": 130402425, "step": 6071, "time_per_iteration": 2.6479921340942383 }, { "auxiliary_loss_clip": 0.0112348, "auxiliary_loss_mlp": 0.01037332, "balance_loss_clip": 1.04683149, "balance_loss_mlp": 1.02142978, "epoch": 0.36506839019990983, "flos": 13772533023360.0, "grad_norm": 1.9443656652026238, "language_loss": 0.88592315, "learning_rate": 2.9334097989332506e-06, "loss": 0.9075312, "num_input_tokens_seen": 130419440, "step": 6072, "time_per_iteration": 2.641340732574463 }, { "auxiliary_loss_clip": 0.01122637, "auxiliary_loss_mlp": 0.01036427, "balance_loss_clip": 1.0495832, "balance_loss_mlp": 1.02225924, "epoch": 0.3651285134525778, "flos": 17274972378240.0, "grad_norm": 2.382408041683643, "language_loss": 0.72436309, "learning_rate": 2.9330653362237094e-06, "loss": 0.7459538, "num_input_tokens_seen": 130438495, "step": 6073, "time_per_iteration": 2.6814513206481934 }, { "auxiliary_loss_clip": 0.01067321, "auxiliary_loss_mlp": 0.01042007, "balance_loss_clip": 1.04483008, "balance_loss_mlp": 1.0249722, "epoch": 0.36518863670524576, "flos": 21907987286400.0, "grad_norm": 3.1332797030940913, "language_loss": 0.66850221, "learning_rate": 2.932720838132236e-06, "loss": 0.68959546, "num_input_tokens_seen": 130455575, "step": 6074, "time_per_iteration": 2.7943460941314697 }, { "auxiliary_loss_clip": 0.01103652, "auxiliary_loss_mlp": 0.01037343, "balance_loss_clip": 1.04833269, "balance_loss_mlp": 1.02238262, "epoch": 0.3652487599579137, "flos": 27122180250240.0, "grad_norm": 1.5371260958261816, "language_loss": 0.72812623, "learning_rate": 2.9323763046718954e-06, "loss": 0.74953616, "num_input_tokens_seen": 130476385, "step": 6075, "time_per_iteration": 2.7581374645233154 }, { "auxiliary_loss_clip": 0.01100578, "auxiliary_loss_mlp": 0.01046604, "balance_loss_clip": 1.04679585, "balance_loss_mlp": 1.03011715, "epoch": 0.3653088832105817, "flos": 19755573626880.0, "grad_norm": 2.1248471900324186, "language_loss": 0.89377797, "learning_rate": 2.9320317358557524e-06, "loss": 0.91524976, "num_input_tokens_seen": 130493630, "step": 6076, "time_per_iteration": 2.7085182666778564 }, { "auxiliary_loss_clip": 0.01125287, "auxiliary_loss_mlp": 0.01043945, "balance_loss_clip": 1.0504595, "balance_loss_mlp": 1.02784586, "epoch": 0.36536900646324966, "flos": 13115008609920.0, "grad_norm": 2.218138292044272, "language_loss": 0.69377828, "learning_rate": 2.931687131696872e-06, "loss": 0.71547067, "num_input_tokens_seen": 130510735, "step": 6077, "time_per_iteration": 2.6516926288604736 }, { "auxiliary_loss_clip": 0.01063406, "auxiliary_loss_mlp": 0.01003112, "balance_loss_clip": 1.03200221, "balance_loss_mlp": 1.00121677, "epoch": 0.3654291297159176, "flos": 71100472383360.0, "grad_norm": 0.7484778409156561, "language_loss": 0.61802375, "learning_rate": 2.9313424922083224e-06, "loss": 0.63868892, "num_input_tokens_seen": 130577050, "step": 6078, "time_per_iteration": 3.2192225456237793 }, { "auxiliary_loss_clip": 0.01105852, "auxiliary_loss_mlp": 0.01053011, "balance_loss_clip": 1.04234397, "balance_loss_mlp": 1.03565383, "epoch": 0.3654892529685856, "flos": 23617478338560.0, "grad_norm": 2.6620805395927283, "language_loss": 0.78445792, "learning_rate": 2.930997817403173e-06, "loss": 0.80604661, "num_input_tokens_seen": 130593780, "step": 6079, "time_per_iteration": 2.6616902351379395 }, { "auxiliary_loss_clip": 0.01129934, "auxiliary_loss_mlp": 0.01040158, "balance_loss_clip": 1.05226243, "balance_loss_mlp": 1.02386224, "epoch": 0.36554937622125355, "flos": 43470799850880.0, "grad_norm": 2.4767906644356037, "language_loss": 0.62662333, "learning_rate": 2.9306531072944913e-06, "loss": 0.64832425, "num_input_tokens_seen": 130615510, "step": 6080, "time_per_iteration": 2.8651509284973145 }, { "auxiliary_loss_clip": 0.01108292, "auxiliary_loss_mlp": 0.01042236, "balance_loss_clip": 1.04737091, "balance_loss_mlp": 1.02529645, "epoch": 0.3656094994739215, "flos": 23294641875840.0, "grad_norm": 3.1314387429818327, "language_loss": 0.67686033, "learning_rate": 2.930308361895352e-06, "loss": 0.69836557, "num_input_tokens_seen": 130635410, "step": 6081, "time_per_iteration": 2.707031011581421 }, { "auxiliary_loss_clip": 0.01112746, "auxiliary_loss_mlp": 0.00773158, "balance_loss_clip": 1.04989302, "balance_loss_mlp": 1.00033236, "epoch": 0.3656696227265895, "flos": 24571984400640.0, "grad_norm": 1.5854068035466964, "language_loss": 0.74755692, "learning_rate": 2.9299635812188257e-06, "loss": 0.76641595, "num_input_tokens_seen": 130657725, "step": 6082, "time_per_iteration": 2.7261881828308105 }, { "auxiliary_loss_clip": 0.01072732, "auxiliary_loss_mlp": 0.00772597, "balance_loss_clip": 1.04222691, "balance_loss_mlp": 1.00042963, "epoch": 0.3657297459792575, "flos": 27928375056000.0, "grad_norm": 2.051480252043875, "language_loss": 0.82956016, "learning_rate": 2.929618765277987e-06, "loss": 0.8480134, "num_input_tokens_seen": 130678360, "step": 6083, "time_per_iteration": 4.360748529434204 }, { "auxiliary_loss_clip": 0.01041394, "auxiliary_loss_mlp": 0.01001412, "balance_loss_clip": 1.02900386, "balance_loss_mlp": 0.99936181, "epoch": 0.36578986923192547, "flos": 67392622126080.0, "grad_norm": 0.8163771270511553, "language_loss": 0.59314513, "learning_rate": 2.9292739140859125e-06, "loss": 0.61357319, "num_input_tokens_seen": 130742110, "step": 6084, "time_per_iteration": 3.3273561000823975 }, { "auxiliary_loss_clip": 0.0109183, "auxiliary_loss_mlp": 0.0104143, "balance_loss_clip": 1.04496968, "balance_loss_mlp": 1.02570593, "epoch": 0.36584999248459343, "flos": 20227511445120.0, "grad_norm": 3.4329037043843478, "language_loss": 0.72791892, "learning_rate": 2.9289290276556767e-06, "loss": 0.74925154, "num_input_tokens_seen": 130759870, "step": 6085, "time_per_iteration": 2.7221856117248535 }, { "auxiliary_loss_clip": 0.01101549, "auxiliary_loss_mlp": 0.01038512, "balance_loss_clip": 1.04982924, "balance_loss_mlp": 1.02383745, "epoch": 0.3659101157372614, "flos": 19062461813760.0, "grad_norm": 2.636651052815632, "language_loss": 0.77860379, "learning_rate": 2.9285841060003604e-06, "loss": 0.80000436, "num_input_tokens_seen": 130778510, "step": 6086, "time_per_iteration": 4.265977621078491 }, { "auxiliary_loss_clip": 0.0111591, "auxiliary_loss_mlp": 0.01032554, "balance_loss_clip": 1.04616153, "balance_loss_mlp": 1.01771855, "epoch": 0.36597023898992936, "flos": 30810708990720.0, "grad_norm": 1.8562986050024126, "language_loss": 0.76759315, "learning_rate": 2.9282391491330416e-06, "loss": 0.78907776, "num_input_tokens_seen": 130798535, "step": 6087, "time_per_iteration": 4.227373123168945 }, { "auxiliary_loss_clip": 0.01081855, "auxiliary_loss_mlp": 0.01042282, "balance_loss_clip": 1.04556108, "balance_loss_mlp": 1.02589023, "epoch": 0.36603036224259733, "flos": 20521799573760.0, "grad_norm": 2.2476274891892474, "language_loss": 0.71063232, "learning_rate": 2.9278941570668002e-06, "loss": 0.73187363, "num_input_tokens_seen": 130816655, "step": 6088, "time_per_iteration": 4.3080058097839355 }, { "auxiliary_loss_clip": 0.01136094, "auxiliary_loss_mlp": 0.01039702, "balance_loss_clip": 1.05314517, "balance_loss_mlp": 1.02267289, "epoch": 0.3660904854952653, "flos": 38329397798400.0, "grad_norm": 1.6318023186273214, "language_loss": 0.79717827, "learning_rate": 2.92754912981472e-06, "loss": 0.81893623, "num_input_tokens_seen": 130841225, "step": 6089, "time_per_iteration": 2.782954216003418 }, { "auxiliary_loss_clip": 0.01099767, "auxiliary_loss_mlp": 0.01036428, "balance_loss_clip": 1.04514015, "balance_loss_mlp": 1.02220643, "epoch": 0.36615060874793326, "flos": 21835555511040.0, "grad_norm": 2.0312735397290043, "language_loss": 0.71617413, "learning_rate": 2.927204067389884e-06, "loss": 0.73753607, "num_input_tokens_seen": 130861050, "step": 6090, "time_per_iteration": 2.7414958477020264 }, { "auxiliary_loss_clip": 0.01105933, "auxiliary_loss_mlp": 0.01047805, "balance_loss_clip": 1.05133104, "balance_loss_mlp": 1.03305852, "epoch": 0.3662107320006012, "flos": 16581537342720.0, "grad_norm": 2.037307676788604, "language_loss": 0.74434924, "learning_rate": 2.9268589698053763e-06, "loss": 0.7658866, "num_input_tokens_seen": 130879775, "step": 6091, "time_per_iteration": 2.628554344177246 }, { "auxiliary_loss_clip": 0.01076087, "auxiliary_loss_mlp": 0.01042935, "balance_loss_clip": 1.04836047, "balance_loss_mlp": 1.02728868, "epoch": 0.3662708552532692, "flos": 20958365473920.0, "grad_norm": 2.1960531931019682, "language_loss": 0.73387206, "learning_rate": 2.926513837074284e-06, "loss": 0.75506234, "num_input_tokens_seen": 130898070, "step": 6092, "time_per_iteration": 2.7320556640625 }, { "auxiliary_loss_clip": 0.01127006, "auxiliary_loss_mlp": 0.01044139, "balance_loss_clip": 1.04809344, "balance_loss_mlp": 1.02796876, "epoch": 0.36633097850593715, "flos": 21902707987200.0, "grad_norm": 1.9967925590844784, "language_loss": 0.77662504, "learning_rate": 2.9261686692096942e-06, "loss": 0.79833645, "num_input_tokens_seen": 130915250, "step": 6093, "time_per_iteration": 2.721311092376709 }, { "auxiliary_loss_clip": 0.01124005, "auxiliary_loss_mlp": 0.01042053, "balance_loss_clip": 1.04696584, "balance_loss_mlp": 1.02686548, "epoch": 0.3663911017586051, "flos": 32854133808000.0, "grad_norm": 1.926436620767835, "language_loss": 0.7455743, "learning_rate": 2.925823466224696e-06, "loss": 0.76723486, "num_input_tokens_seen": 130936995, "step": 6094, "time_per_iteration": 2.767188310623169 }, { "auxiliary_loss_clip": 0.01142303, "auxiliary_loss_mlp": 0.01055832, "balance_loss_clip": 1.05334711, "balance_loss_mlp": 1.03969133, "epoch": 0.3664512250112731, "flos": 27271748482560.0, "grad_norm": 1.743331442809004, "language_loss": 0.79444361, "learning_rate": 2.9254782281323785e-06, "loss": 0.81642497, "num_input_tokens_seen": 130957970, "step": 6095, "time_per_iteration": 2.718632459640503 }, { "auxiliary_loss_clip": 0.01118218, "auxiliary_loss_mlp": 0.00774719, "balance_loss_clip": 1.05141842, "balance_loss_mlp": 1.00037265, "epoch": 0.3665113482639411, "flos": 17784436930560.0, "grad_norm": 3.4988865885900178, "language_loss": 0.73592722, "learning_rate": 2.925132954945834e-06, "loss": 0.75485659, "num_input_tokens_seen": 130974915, "step": 6096, "time_per_iteration": 2.674382448196411 }, { "auxiliary_loss_clip": 0.01099743, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.04458702, "balance_loss_mlp": 1.02355742, "epoch": 0.36657147151660907, "flos": 27854614477440.0, "grad_norm": 2.41624095312735, "language_loss": 0.67081815, "learning_rate": 2.924787646678155e-06, "loss": 0.69220531, "num_input_tokens_seen": 130995745, "step": 6097, "time_per_iteration": 2.789118766784668 }, { "auxiliary_loss_clip": 0.01077673, "auxiliary_loss_mlp": 0.01038362, "balance_loss_clip": 1.04489172, "balance_loss_mlp": 1.02268624, "epoch": 0.36663159476927704, "flos": 25374013228800.0, "grad_norm": 1.4796406838499911, "language_loss": 0.77679402, "learning_rate": 2.9244423033424365e-06, "loss": 0.79795432, "num_input_tokens_seen": 131015545, "step": 6098, "time_per_iteration": 2.7803733348846436 }, { "auxiliary_loss_clip": 0.01122346, "auxiliary_loss_mlp": 0.01045291, "balance_loss_clip": 1.04734826, "balance_loss_mlp": 1.02987766, "epoch": 0.366691718021945, "flos": 21357225072000.0, "grad_norm": 1.744595499322522, "language_loss": 0.73707491, "learning_rate": 2.9240969249517723e-06, "loss": 0.75875127, "num_input_tokens_seen": 131033990, "step": 6099, "time_per_iteration": 2.6809163093566895 }, { "auxiliary_loss_clip": 0.01111202, "auxiliary_loss_mlp": 0.01044256, "balance_loss_clip": 1.04759586, "balance_loss_mlp": 1.02931285, "epoch": 0.36675184127461297, "flos": 16800376953600.0, "grad_norm": 1.8475933970370078, "language_loss": 0.84773195, "learning_rate": 2.9237515115192602e-06, "loss": 0.86928654, "num_input_tokens_seen": 131050710, "step": 6100, "time_per_iteration": 2.6730356216430664 }, { "auxiliary_loss_clip": 0.01102438, "auxiliary_loss_mlp": 0.01037575, "balance_loss_clip": 1.04448223, "balance_loss_mlp": 1.02181566, "epoch": 0.36681196452728093, "flos": 21906514828800.0, "grad_norm": 3.9532097547953104, "language_loss": 0.70893979, "learning_rate": 2.9234060630579992e-06, "loss": 0.73033994, "num_input_tokens_seen": 131071435, "step": 6101, "time_per_iteration": 2.7369589805603027 }, { "auxiliary_loss_clip": 0.01111262, "auxiliary_loss_mlp": 0.01052791, "balance_loss_clip": 1.05096185, "balance_loss_mlp": 1.0361371, "epoch": 0.3668720877799489, "flos": 17712436118400.0, "grad_norm": 2.286737474315047, "language_loss": 0.76634502, "learning_rate": 2.9230605795810865e-06, "loss": 0.7879855, "num_input_tokens_seen": 131088775, "step": 6102, "time_per_iteration": 2.7081708908081055 }, { "auxiliary_loss_clip": 0.01131629, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.0524683, "balance_loss_mlp": 1.02050483, "epoch": 0.36693221103261686, "flos": 47045455499520.0, "grad_norm": 4.369253140908342, "language_loss": 0.70019859, "learning_rate": 2.922715061101625e-06, "loss": 0.72188866, "num_input_tokens_seen": 131112800, "step": 6103, "time_per_iteration": 2.8610281944274902 }, { "auxiliary_loss_clip": 0.01093091, "auxiliary_loss_mlp": 0.0103895, "balance_loss_clip": 1.04730344, "balance_loss_mlp": 1.02283263, "epoch": 0.3669923342852848, "flos": 15960929132160.0, "grad_norm": 3.0883152470965842, "language_loss": 0.72272754, "learning_rate": 2.922369507632716e-06, "loss": 0.744048, "num_input_tokens_seen": 131131150, "step": 6104, "time_per_iteration": 2.7520432472229004 }, { "auxiliary_loss_clip": 0.01127975, "auxiliary_loss_mlp": 0.01036046, "balance_loss_clip": 1.05017686, "balance_loss_mlp": 1.01940393, "epoch": 0.3670524575379528, "flos": 19974485064960.0, "grad_norm": 2.1608886453477947, "language_loss": 0.81461251, "learning_rate": 2.9220239191874617e-06, "loss": 0.83625269, "num_input_tokens_seen": 131150365, "step": 6105, "time_per_iteration": 2.7565362453460693 }, { "auxiliary_loss_clip": 0.0114363, "auxiliary_loss_mlp": 0.01041522, "balance_loss_clip": 1.05170739, "balance_loss_mlp": 1.02526236, "epoch": 0.36711258079062076, "flos": 25702955003520.0, "grad_norm": 1.7202629897198451, "language_loss": 0.81035495, "learning_rate": 2.9216782957789692e-06, "loss": 0.83220649, "num_input_tokens_seen": 131169310, "step": 6106, "time_per_iteration": 2.73502779006958 }, { "auxiliary_loss_clip": 0.01035121, "auxiliary_loss_mlp": 0.00753905, "balance_loss_clip": 1.03131676, "balance_loss_mlp": 1.00104892, "epoch": 0.3671727040432887, "flos": 60772743342720.0, "grad_norm": 0.6921927745874564, "language_loss": 0.59176284, "learning_rate": 2.9213326374203426e-06, "loss": 0.60965312, "num_input_tokens_seen": 131232900, "step": 6107, "time_per_iteration": 3.2754647731781006 }, { "auxiliary_loss_clip": 0.01111272, "auxiliary_loss_mlp": 0.01035704, "balance_loss_clip": 1.04770529, "balance_loss_mlp": 1.02058864, "epoch": 0.3672328272959567, "flos": 18661303745280.0, "grad_norm": 1.8102661289525128, "language_loss": 0.74492711, "learning_rate": 2.92098694412469e-06, "loss": 0.76639688, "num_input_tokens_seen": 131250920, "step": 6108, "time_per_iteration": 2.730562448501587 }, { "auxiliary_loss_clip": 0.01129123, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.04957151, "balance_loss_mlp": 1.02196801, "epoch": 0.3672929505486247, "flos": 15049049535360.0, "grad_norm": 2.04949693656995, "language_loss": 0.72790694, "learning_rate": 2.9206412159051213e-06, "loss": 0.7495752, "num_input_tokens_seen": 131267910, "step": 6109, "time_per_iteration": 2.6488542556762695 }, { "auxiliary_loss_clip": 0.01065451, "auxiliary_loss_mlp": 0.01040533, "balance_loss_clip": 1.04156637, "balance_loss_mlp": 1.02426052, "epoch": 0.3673530738012927, "flos": 20589347099520.0, "grad_norm": 4.856830375229604, "language_loss": 0.53295934, "learning_rate": 2.920295452774744e-06, "loss": 0.55401909, "num_input_tokens_seen": 131287150, "step": 6110, "time_per_iteration": 2.8366596698760986 }, { "auxiliary_loss_clip": 0.01123878, "auxiliary_loss_mlp": 0.01039006, "balance_loss_clip": 1.04783487, "balance_loss_mlp": 1.02253747, "epoch": 0.36741319705396064, "flos": 21689830033920.0, "grad_norm": 1.6516494205850427, "language_loss": 0.80507129, "learning_rate": 2.919949654746672e-06, "loss": 0.82670015, "num_input_tokens_seen": 131308225, "step": 6111, "time_per_iteration": 2.7537708282470703 }, { "auxiliary_loss_clip": 0.01083524, "auxiliary_loss_mlp": 0.01044306, "balance_loss_clip": 1.04381704, "balance_loss_mlp": 1.02897525, "epoch": 0.3674733203066286, "flos": 29862200499840.0, "grad_norm": 1.7980410764958656, "language_loss": 0.72401643, "learning_rate": 2.9196038218340163e-06, "loss": 0.74529469, "num_input_tokens_seen": 131332115, "step": 6112, "time_per_iteration": 2.80513858795166 }, { "auxiliary_loss_clip": 0.0112775, "auxiliary_loss_mlp": 0.01046215, "balance_loss_clip": 1.05025816, "balance_loss_mlp": 1.03102732, "epoch": 0.36753344355929657, "flos": 18257021193600.0, "grad_norm": 1.6179233760027578, "language_loss": 0.8539387, "learning_rate": 2.919257954049892e-06, "loss": 0.8756783, "num_input_tokens_seen": 131351885, "step": 6113, "time_per_iteration": 2.6997315883636475 }, { "auxiliary_loss_clip": 0.01128342, "auxiliary_loss_mlp": 0.01041644, "balance_loss_clip": 1.04813516, "balance_loss_mlp": 1.02512193, "epoch": 0.36759356681196453, "flos": 25301150490240.0, "grad_norm": 2.2420277636185872, "language_loss": 0.78542709, "learning_rate": 2.918912051407413e-06, "loss": 0.807127, "num_input_tokens_seen": 131370245, "step": 6114, "time_per_iteration": 2.694831609725952 }, { "auxiliary_loss_clip": 0.01133627, "auxiliary_loss_mlp": 0.01044455, "balance_loss_clip": 1.05145383, "balance_loss_mlp": 1.02612031, "epoch": 0.3676536900646325, "flos": 21032952065280.0, "grad_norm": 1.6750895304816946, "language_loss": 0.67368686, "learning_rate": 2.918566113919698e-06, "loss": 0.69546771, "num_input_tokens_seen": 131388115, "step": 6115, "time_per_iteration": 2.6966724395751953 }, { "auxiliary_loss_clip": 0.01104674, "auxiliary_loss_mlp": 0.01037383, "balance_loss_clip": 1.04332471, "balance_loss_mlp": 1.02229142, "epoch": 0.36771381331730046, "flos": 16288506190080.0, "grad_norm": 3.500949938115168, "language_loss": 0.76685899, "learning_rate": 2.9182201415998636e-06, "loss": 0.78827953, "num_input_tokens_seen": 131404595, "step": 6116, "time_per_iteration": 2.6796109676361084 }, { "auxiliary_loss_clip": 0.01088778, "auxiliary_loss_mlp": 0.01043047, "balance_loss_clip": 1.04433835, "balance_loss_mlp": 1.02729988, "epoch": 0.36777393656996843, "flos": 22309971367680.0, "grad_norm": 1.7533988300226562, "language_loss": 0.62997502, "learning_rate": 2.9178741344610286e-06, "loss": 0.65129328, "num_input_tokens_seen": 131423760, "step": 6117, "time_per_iteration": 2.7784011363983154 }, { "auxiliary_loss_clip": 0.01103848, "auxiliary_loss_mlp": 0.01037351, "balance_loss_clip": 1.04275632, "balance_loss_mlp": 1.0210557, "epoch": 0.3678340598226364, "flos": 26834069260800.0, "grad_norm": 1.9867834860772036, "language_loss": 0.73087811, "learning_rate": 2.9175280925163156e-06, "loss": 0.75229007, "num_input_tokens_seen": 131444955, "step": 6118, "time_per_iteration": 2.734731674194336 }, { "auxiliary_loss_clip": 0.01132746, "auxiliary_loss_mlp": 0.01043898, "balance_loss_clip": 1.05198336, "balance_loss_mlp": 1.0266242, "epoch": 0.36789418307530436, "flos": 21761723105280.0, "grad_norm": 2.319960114880422, "language_loss": 0.72638988, "learning_rate": 2.9171820157788445e-06, "loss": 0.74815631, "num_input_tokens_seen": 131465720, "step": 6119, "time_per_iteration": 2.7073371410369873 }, { "auxiliary_loss_clip": 0.0111183, "auxiliary_loss_mlp": 0.01037904, "balance_loss_clip": 1.04830384, "balance_loss_mlp": 1.02101171, "epoch": 0.3679543063279723, "flos": 15924192497280.0, "grad_norm": 1.9587818101138383, "language_loss": 0.80524689, "learning_rate": 2.9168359042617404e-06, "loss": 0.8267442, "num_input_tokens_seen": 131483080, "step": 6120, "time_per_iteration": 2.679933547973633 }, { "auxiliary_loss_clip": 0.01093981, "auxiliary_loss_mlp": 0.0104441, "balance_loss_clip": 1.04785204, "balance_loss_mlp": 1.02894819, "epoch": 0.3680144295806403, "flos": 24275541456000.0, "grad_norm": 2.4092121945194496, "language_loss": 0.64745319, "learning_rate": 2.916489757978126e-06, "loss": 0.66883707, "num_input_tokens_seen": 131502545, "step": 6121, "time_per_iteration": 2.7067880630493164 }, { "auxiliary_loss_clip": 0.01126101, "auxiliary_loss_mlp": 0.01043212, "balance_loss_clip": 1.05021691, "balance_loss_mlp": 1.02735114, "epoch": 0.36807455283330826, "flos": 26104148985600.0, "grad_norm": 1.774708172393826, "language_loss": 0.71686751, "learning_rate": 2.9161435769411286e-06, "loss": 0.73856068, "num_input_tokens_seen": 131522155, "step": 6122, "time_per_iteration": 4.026647329330444 }, { "auxiliary_loss_clip": 0.01106964, "auxiliary_loss_mlp": 0.01043545, "balance_loss_clip": 1.04859734, "balance_loss_mlp": 1.0265938, "epoch": 0.3681346760859763, "flos": 24644990793600.0, "grad_norm": 5.6855406070233245, "language_loss": 0.69653022, "learning_rate": 2.915797361163875e-06, "loss": 0.71803534, "num_input_tokens_seen": 131543865, "step": 6123, "time_per_iteration": 2.7548627853393555 }, { "auxiliary_loss_clip": 0.01128204, "auxiliary_loss_mlp": 0.01040578, "balance_loss_clip": 1.04822993, "balance_loss_mlp": 1.02251744, "epoch": 0.36819479933864424, "flos": 23878369797120.0, "grad_norm": 7.022932421262019, "language_loss": 0.73640841, "learning_rate": 2.9154511106594933e-06, "loss": 0.75809622, "num_input_tokens_seen": 131562155, "step": 6124, "time_per_iteration": 2.6710870265960693 }, { "auxiliary_loss_clip": 0.01116833, "auxiliary_loss_mlp": 0.01045789, "balance_loss_clip": 1.04977059, "balance_loss_mlp": 1.02809882, "epoch": 0.3682549225913122, "flos": 25553997302400.0, "grad_norm": 1.931714997280456, "language_loss": 0.74334198, "learning_rate": 2.915104825441114e-06, "loss": 0.76496822, "num_input_tokens_seen": 131581695, "step": 6125, "time_per_iteration": 4.175686359405518 }, { "auxiliary_loss_clip": 0.01132649, "auxiliary_loss_mlp": 0.01053205, "balance_loss_clip": 1.05193818, "balance_loss_mlp": 1.03514445, "epoch": 0.36831504584398017, "flos": 16946605221120.0, "grad_norm": 1.8318884745506827, "language_loss": 0.78127813, "learning_rate": 2.9147585055218686e-06, "loss": 0.80313659, "num_input_tokens_seen": 131599465, "step": 6126, "time_per_iteration": 2.6783266067504883 }, { "auxiliary_loss_clip": 0.01128437, "auxiliary_loss_mlp": 0.01045021, "balance_loss_clip": 1.0490706, "balance_loss_mlp": 1.02659082, "epoch": 0.36837516909664814, "flos": 19865065259520.0, "grad_norm": 2.7490159956422575, "language_loss": 0.66118228, "learning_rate": 2.914412150914888e-06, "loss": 0.68291688, "num_input_tokens_seen": 131618330, "step": 6127, "time_per_iteration": 4.20530891418457 }, { "auxiliary_loss_clip": 0.01120142, "auxiliary_loss_mlp": 0.01046706, "balance_loss_clip": 1.05205703, "balance_loss_mlp": 1.02980185, "epoch": 0.3684352923493161, "flos": 37626984362880.0, "grad_norm": 1.8515813315176315, "language_loss": 0.70152593, "learning_rate": 2.9140657616333074e-06, "loss": 0.72319436, "num_input_tokens_seen": 131638960, "step": 6128, "time_per_iteration": 4.498606204986572 }, { "auxiliary_loss_clip": 0.0112131, "auxiliary_loss_mlp": 0.01046424, "balance_loss_clip": 1.05264103, "balance_loss_mlp": 1.02957964, "epoch": 0.36849541560198407, "flos": 14465501182080.0, "grad_norm": 2.3245894967836698, "language_loss": 0.75067866, "learning_rate": 2.9137193376902614e-06, "loss": 0.77235603, "num_input_tokens_seen": 131657440, "step": 6129, "time_per_iteration": 2.6874284744262695 }, { "auxiliary_loss_clip": 0.01118674, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.04533887, "balance_loss_mlp": 1.02403355, "epoch": 0.36855553885465203, "flos": 25770753924480.0, "grad_norm": 1.6533761140504426, "language_loss": 0.84758681, "learning_rate": 2.9133728790988868e-06, "loss": 0.86918116, "num_input_tokens_seen": 131678035, "step": 6130, "time_per_iteration": 2.729963541030884 }, { "auxiliary_loss_clip": 0.0102639, "auxiliary_loss_mlp": 0.01017875, "balance_loss_clip": 1.02295637, "balance_loss_mlp": 1.01620567, "epoch": 0.36861566210732, "flos": 65049417377280.0, "grad_norm": 0.8481176099425293, "language_loss": 0.60254776, "learning_rate": 2.913026385872321e-06, "loss": 0.62299049, "num_input_tokens_seen": 131742470, "step": 6131, "time_per_iteration": 3.2806124687194824 }, { "auxiliary_loss_clip": 0.01097122, "auxiliary_loss_mlp": 0.01035652, "balance_loss_clip": 1.04542315, "balance_loss_mlp": 1.01914179, "epoch": 0.36867578535998796, "flos": 30954495133440.0, "grad_norm": 1.5587449528822306, "language_loss": 0.73085582, "learning_rate": 2.9126798580237034e-06, "loss": 0.75218356, "num_input_tokens_seen": 131764570, "step": 6132, "time_per_iteration": 2.781385898590088 }, { "auxiliary_loss_clip": 0.01127214, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.04795551, "balance_loss_mlp": 1.02187514, "epoch": 0.3687359086126559, "flos": 28837956182400.0, "grad_norm": 1.9292425463255205, "language_loss": 0.74192035, "learning_rate": 2.9123332955661736e-06, "loss": 0.76358628, "num_input_tokens_seen": 131785720, "step": 6133, "time_per_iteration": 2.718660831451416 }, { "auxiliary_loss_clip": 0.01072831, "auxiliary_loss_mlp": 0.01049093, "balance_loss_clip": 1.041502, "balance_loss_mlp": 1.03042495, "epoch": 0.3687960318653239, "flos": 21396798881280.0, "grad_norm": 1.8863128538280483, "language_loss": 0.71522588, "learning_rate": 2.911986698512874e-06, "loss": 0.73644507, "num_input_tokens_seen": 131804430, "step": 6134, "time_per_iteration": 2.8003294467926025 }, { "auxiliary_loss_clip": 0.01102901, "auxiliary_loss_mlp": 0.01034768, "balance_loss_clip": 1.0472008, "balance_loss_mlp": 1.01838863, "epoch": 0.36885615511799186, "flos": 20266043760000.0, "grad_norm": 1.6906065874809195, "language_loss": 0.75386798, "learning_rate": 2.9116400668769477e-06, "loss": 0.77524465, "num_input_tokens_seen": 131822060, "step": 6135, "time_per_iteration": 2.7916624546051025 }, { "auxiliary_loss_clip": 0.01030435, "auxiliary_loss_mlp": 0.01019879, "balance_loss_clip": 1.0281316, "balance_loss_mlp": 1.01760185, "epoch": 0.3689162783706599, "flos": 63088836301440.0, "grad_norm": 0.8159837123545765, "language_loss": 0.58766222, "learning_rate": 2.9112934006715376e-06, "loss": 0.60816532, "num_input_tokens_seen": 131880715, "step": 6136, "time_per_iteration": 3.2766408920288086 }, { "auxiliary_loss_clip": 0.01106354, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.04497695, "balance_loss_mlp": 1.02723718, "epoch": 0.36897640162332784, "flos": 10961984419200.0, "grad_norm": 2.3780452593473393, "language_loss": 0.79126394, "learning_rate": 2.9109466999097918e-06, "loss": 0.81276655, "num_input_tokens_seen": 131895850, "step": 6137, "time_per_iteration": 2.8411052227020264 }, { "auxiliary_loss_clip": 0.011261, "auxiliary_loss_mlp": 0.01043272, "balance_loss_clip": 1.04803205, "balance_loss_mlp": 1.02645159, "epoch": 0.3690365248759958, "flos": 20704297599360.0, "grad_norm": 2.0312275113078337, "language_loss": 0.7454071, "learning_rate": 2.9105999646048552e-06, "loss": 0.76710081, "num_input_tokens_seen": 131915775, "step": 6138, "time_per_iteration": 2.7210230827331543 }, { "auxiliary_loss_clip": 0.01090918, "auxiliary_loss_mlp": 0.01042472, "balance_loss_clip": 1.04320955, "balance_loss_mlp": 1.0259856, "epoch": 0.3690966481286638, "flos": 31826369957760.0, "grad_norm": 2.0947758027881767, "language_loss": 0.64676917, "learning_rate": 2.9102531947698764e-06, "loss": 0.66810304, "num_input_tokens_seen": 131935715, "step": 6139, "time_per_iteration": 2.8667304515838623 }, { "auxiliary_loss_clip": 0.01095075, "auxiliary_loss_mlp": 0.01042873, "balance_loss_clip": 1.04443955, "balance_loss_mlp": 1.02646971, "epoch": 0.36915677138133174, "flos": 13114936782720.0, "grad_norm": 2.1146776737326998, "language_loss": 0.71764016, "learning_rate": 2.909906390418006e-06, "loss": 0.73901963, "num_input_tokens_seen": 131954120, "step": 6140, "time_per_iteration": 2.718100070953369 }, { "auxiliary_loss_clip": 0.01017799, "auxiliary_loss_mlp": 0.01004631, "balance_loss_clip": 1.02079976, "balance_loss_mlp": 1.00281894, "epoch": 0.3692168946339997, "flos": 68686879956480.0, "grad_norm": 0.7503567012350645, "language_loss": 0.59252203, "learning_rate": 2.9095595515623934e-06, "loss": 0.61274636, "num_input_tokens_seen": 132017485, "step": 6141, "time_per_iteration": 3.3003833293914795 }, { "auxiliary_loss_clip": 0.01122088, "auxiliary_loss_mlp": 0.01040836, "balance_loss_clip": 1.04716861, "balance_loss_mlp": 1.02458787, "epoch": 0.36927701788666767, "flos": 22017873968640.0, "grad_norm": 1.900744005055956, "language_loss": 0.75374687, "learning_rate": 2.909212678216192e-06, "loss": 0.77537608, "num_input_tokens_seen": 132036760, "step": 6142, "time_per_iteration": 2.707676410675049 }, { "auxiliary_loss_clip": 0.01122008, "auxiliary_loss_mlp": 0.01037683, "balance_loss_clip": 1.04708242, "balance_loss_mlp": 1.02276349, "epoch": 0.36933714113933563, "flos": 21835591424640.0, "grad_norm": 2.0868371024046346, "language_loss": 0.77474618, "learning_rate": 2.908865770392555e-06, "loss": 0.79634303, "num_input_tokens_seen": 132056935, "step": 6143, "time_per_iteration": 2.6308929920196533 }, { "auxiliary_loss_clip": 0.01122961, "auxiliary_loss_mlp": 0.01033227, "balance_loss_clip": 1.04840302, "balance_loss_mlp": 1.01860011, "epoch": 0.3693972643920036, "flos": 23691705793920.0, "grad_norm": 2.7754530777388555, "language_loss": 0.82127941, "learning_rate": 2.9085188281046364e-06, "loss": 0.84284127, "num_input_tokens_seen": 132077285, "step": 6144, "time_per_iteration": 2.7094409465789795 }, { "auxiliary_loss_clip": 0.01126238, "auxiliary_loss_mlp": 0.01040495, "balance_loss_clip": 1.0479883, "balance_loss_mlp": 1.02547419, "epoch": 0.36945738764467156, "flos": 22856747172480.0, "grad_norm": 2.260022101229928, "language_loss": 0.774791, "learning_rate": 2.908171851365593e-06, "loss": 0.79645836, "num_input_tokens_seen": 132095520, "step": 6145, "time_per_iteration": 2.6951241493225098 }, { "auxiliary_loss_clip": 0.01120499, "auxiliary_loss_mlp": 0.01030806, "balance_loss_clip": 1.04903388, "balance_loss_mlp": 1.01503491, "epoch": 0.36951751089733953, "flos": 16615939593600.0, "grad_norm": 2.2611713814894423, "language_loss": 0.76861286, "learning_rate": 2.9078248401885815e-06, "loss": 0.79012597, "num_input_tokens_seen": 132112810, "step": 6146, "time_per_iteration": 2.6205246448516846 }, { "auxiliary_loss_clip": 0.0110988, "auxiliary_loss_mlp": 0.01042802, "balance_loss_clip": 1.04717457, "balance_loss_mlp": 1.02518249, "epoch": 0.3695776341500075, "flos": 18914545607040.0, "grad_norm": 3.3549376840260394, "language_loss": 0.80945081, "learning_rate": 2.907477794586761e-06, "loss": 0.83097762, "num_input_tokens_seen": 132131615, "step": 6147, "time_per_iteration": 2.7176942825317383 }, { "auxiliary_loss_clip": 0.01108097, "auxiliary_loss_mlp": 0.00773519, "balance_loss_clip": 1.05041718, "balance_loss_mlp": 1.00029731, "epoch": 0.36963775740267546, "flos": 20808474019200.0, "grad_norm": 1.8104892137163535, "language_loss": 0.83325249, "learning_rate": 2.9071307145732926e-06, "loss": 0.85206866, "num_input_tokens_seen": 132149585, "step": 6148, "time_per_iteration": 2.7764229774475098 }, { "auxiliary_loss_clip": 0.01121751, "auxiliary_loss_mlp": 0.01033697, "balance_loss_clip": 1.04946411, "balance_loss_mlp": 1.01843238, "epoch": 0.3696978806553435, "flos": 26061881656320.0, "grad_norm": 2.472295207741171, "language_loss": 0.74167144, "learning_rate": 2.9067836001613357e-06, "loss": 0.76322597, "num_input_tokens_seen": 132165555, "step": 6149, "time_per_iteration": 2.729785680770874 }, { "auxiliary_loss_clip": 0.01141043, "auxiliary_loss_mlp": 0.01040796, "balance_loss_clip": 1.0524776, "balance_loss_mlp": 1.02347541, "epoch": 0.36975800390801145, "flos": 26833925606400.0, "grad_norm": 2.18045381202803, "language_loss": 0.71229833, "learning_rate": 2.906436451364054e-06, "loss": 0.73411667, "num_input_tokens_seen": 132185100, "step": 6150, "time_per_iteration": 2.6558914184570312 }, { "auxiliary_loss_clip": 0.01112432, "auxiliary_loss_mlp": 0.0104236, "balance_loss_clip": 1.04834723, "balance_loss_mlp": 1.02634454, "epoch": 0.3698181271606794, "flos": 21142623265920.0, "grad_norm": 2.1283605732632487, "language_loss": 0.82001126, "learning_rate": 2.906089268194611e-06, "loss": 0.84155917, "num_input_tokens_seen": 132203930, "step": 6151, "time_per_iteration": 2.811908483505249 }, { "auxiliary_loss_clip": 0.0104085, "auxiliary_loss_mlp": 0.01012111, "balance_loss_clip": 1.02895284, "balance_loss_mlp": 1.01035905, "epoch": 0.3698782504133474, "flos": 66742639568640.0, "grad_norm": 0.8434423047890295, "language_loss": 0.63103437, "learning_rate": 2.9057420506661726e-06, "loss": 0.651564, "num_input_tokens_seen": 132263845, "step": 6152, "time_per_iteration": 3.283348798751831 }, { "auxiliary_loss_clip": 0.01083912, "auxiliary_loss_mlp": 0.01046371, "balance_loss_clip": 1.04603028, "balance_loss_mlp": 1.02939606, "epoch": 0.36993837366601534, "flos": 24311523905280.0, "grad_norm": 2.101714417244525, "language_loss": 0.70249707, "learning_rate": 2.9053947987919044e-06, "loss": 0.72379988, "num_input_tokens_seen": 132282350, "step": 6153, "time_per_iteration": 2.776003837585449 }, { "auxiliary_loss_clip": 0.01126735, "auxiliary_loss_mlp": 0.01038393, "balance_loss_clip": 1.04984677, "balance_loss_mlp": 1.02176309, "epoch": 0.3699984969186833, "flos": 24349194293760.0, "grad_norm": 1.5983560512083512, "language_loss": 0.72364891, "learning_rate": 2.9050475125849755e-06, "loss": 0.74530017, "num_input_tokens_seen": 132301930, "step": 6154, "time_per_iteration": 2.7031455039978027 }, { "auxiliary_loss_clip": 0.01108862, "auxiliary_loss_mlp": 0.01038947, "balance_loss_clip": 1.04792106, "balance_loss_mlp": 1.02376008, "epoch": 0.37005862017135127, "flos": 19829154637440.0, "grad_norm": 1.6579101756116525, "language_loss": 0.67716074, "learning_rate": 2.9047001920585534e-06, "loss": 0.6986388, "num_input_tokens_seen": 132320915, "step": 6155, "time_per_iteration": 2.7716591358184814 }, { "auxiliary_loss_clip": 0.01124062, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.04789114, "balance_loss_mlp": 1.0171442, "epoch": 0.37011874342401924, "flos": 19573793873280.0, "grad_norm": 1.797024775246088, "language_loss": 0.68048114, "learning_rate": 2.9043528372258097e-06, "loss": 0.70204842, "num_input_tokens_seen": 132340415, "step": 6156, "time_per_iteration": 2.7830615043640137 }, { "auxiliary_loss_clip": 0.01109781, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.04603815, "balance_loss_mlp": 1.02202225, "epoch": 0.3701788666766872, "flos": 20374350243840.0, "grad_norm": 1.8485807917443284, "language_loss": 0.82232833, "learning_rate": 2.904005448099916e-06, "loss": 0.84379458, "num_input_tokens_seen": 132358600, "step": 6157, "time_per_iteration": 2.676429033279419 }, { "auxiliary_loss_clip": 0.01087924, "auxiliary_loss_mlp": 0.01042208, "balance_loss_clip": 1.04360199, "balance_loss_mlp": 1.02474344, "epoch": 0.37023898992935517, "flos": 15340931452800.0, "grad_norm": 2.2992188770836175, "language_loss": 0.76899838, "learning_rate": 2.9036580246940444e-06, "loss": 0.79029977, "num_input_tokens_seen": 132373160, "step": 6158, "time_per_iteration": 2.7764365673065186 }, { "auxiliary_loss_clip": 0.01138492, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.0489651, "balance_loss_mlp": 1.01997483, "epoch": 0.37029911318202313, "flos": 19573937527680.0, "grad_norm": 2.8360595009252196, "language_loss": 0.68930852, "learning_rate": 2.9033105670213708e-06, "loss": 0.71106398, "num_input_tokens_seen": 132392345, "step": 6159, "time_per_iteration": 2.664858818054199 }, { "auxiliary_loss_clip": 0.01110756, "auxiliary_loss_mlp": 0.01035031, "balance_loss_clip": 1.049088, "balance_loss_mlp": 1.02067792, "epoch": 0.3703592364346911, "flos": 26213353309440.0, "grad_norm": 2.9956624327703523, "language_loss": 0.71067882, "learning_rate": 2.9029630750950697e-06, "loss": 0.73213673, "num_input_tokens_seen": 132412620, "step": 6160, "time_per_iteration": 2.757081985473633 }, { "auxiliary_loss_clip": 0.01106906, "auxiliary_loss_mlp": 0.01033059, "balance_loss_clip": 1.04698467, "balance_loss_mlp": 1.01918936, "epoch": 0.37041935968735906, "flos": 20048317470720.0, "grad_norm": 2.0439504076987403, "language_loss": 0.79205775, "learning_rate": 2.9026155489283176e-06, "loss": 0.81345737, "num_input_tokens_seen": 132431570, "step": 6161, "time_per_iteration": 2.8008711338043213 }, { "auxiliary_loss_clip": 0.01136197, "auxiliary_loss_mlp": 0.01038947, "balance_loss_clip": 1.04960537, "balance_loss_mlp": 1.02284193, "epoch": 0.3704794829400271, "flos": 24133802388480.0, "grad_norm": 2.0425786778899058, "language_loss": 0.79665029, "learning_rate": 2.902267988534295e-06, "loss": 0.81840169, "num_input_tokens_seen": 132451525, "step": 6162, "time_per_iteration": 4.2554450035095215 }, { "auxiliary_loss_clip": 0.01107039, "auxiliary_loss_mlp": 0.00773743, "balance_loss_clip": 1.0442729, "balance_loss_mlp": 1.00038123, "epoch": 0.37053960619269505, "flos": 14866874732160.0, "grad_norm": 2.0272159369395193, "language_loss": 0.79314882, "learning_rate": 2.9019203939261783e-06, "loss": 0.81195664, "num_input_tokens_seen": 132469875, "step": 6163, "time_per_iteration": 2.753324508666992 }, { "auxiliary_loss_clip": 0.0112147, "auxiliary_loss_mlp": 0.01039825, "balance_loss_clip": 1.04676855, "balance_loss_mlp": 1.02351689, "epoch": 0.370599729445363, "flos": 21361498790400.0, "grad_norm": 1.847799951808159, "language_loss": 0.67843366, "learning_rate": 2.9015727651171507e-06, "loss": 0.7000466, "num_input_tokens_seen": 132488360, "step": 6164, "time_per_iteration": 2.7885541915893555 }, { "auxiliary_loss_clip": 0.01109766, "auxiliary_loss_mlp": 0.01045808, "balance_loss_clip": 1.04918885, "balance_loss_mlp": 1.02877307, "epoch": 0.370659852698031, "flos": 26829041356800.0, "grad_norm": 2.0007288653084334, "language_loss": 0.83441198, "learning_rate": 2.9012251021203935e-06, "loss": 0.85596776, "num_input_tokens_seen": 132508630, "step": 6165, "time_per_iteration": 4.3637871742248535 }, { "auxiliary_loss_clip": 0.01115767, "auxiliary_loss_mlp": 0.01037848, "balance_loss_clip": 1.0473845, "balance_loss_mlp": 1.02026439, "epoch": 0.37071997595069894, "flos": 19099018880640.0, "grad_norm": 1.7502292049636352, "language_loss": 0.69057518, "learning_rate": 2.9008774049490896e-06, "loss": 0.71211129, "num_input_tokens_seen": 132527465, "step": 6166, "time_per_iteration": 2.6754019260406494 }, { "auxiliary_loss_clip": 0.01032616, "auxiliary_loss_mlp": 0.01025464, "balance_loss_clip": 1.03081024, "balance_loss_mlp": 1.02362847, "epoch": 0.3707800992033669, "flos": 52178384920320.0, "grad_norm": 0.8028866408552083, "language_loss": 0.5688796, "learning_rate": 2.9005296736164244e-06, "loss": 0.58946037, "num_input_tokens_seen": 132579940, "step": 6167, "time_per_iteration": 6.357440233230591 }, { "auxiliary_loss_clip": 0.01110244, "auxiliary_loss_mlp": 0.01037896, "balance_loss_clip": 1.04592001, "balance_loss_mlp": 1.02284551, "epoch": 0.3708402224560349, "flos": 19901837808000.0, "grad_norm": 2.0812394742982203, "language_loss": 0.75159574, "learning_rate": 2.900181908135584e-06, "loss": 0.77307719, "num_input_tokens_seen": 132598390, "step": 6168, "time_per_iteration": 2.7107198238372803 }, { "auxiliary_loss_clip": 0.01117658, "auxiliary_loss_mlp": 0.00773774, "balance_loss_clip": 1.04381216, "balance_loss_mlp": 1.00029826, "epoch": 0.37090034570870284, "flos": 20007630339840.0, "grad_norm": 2.166706099657804, "language_loss": 0.73690271, "learning_rate": 2.899834108519755e-06, "loss": 0.755817, "num_input_tokens_seen": 132616920, "step": 6169, "time_per_iteration": 2.743741035461426 }, { "auxiliary_loss_clip": 0.0113208, "auxiliary_loss_mlp": 0.01038383, "balance_loss_clip": 1.0476737, "balance_loss_mlp": 1.02352989, "epoch": 0.3709604689613708, "flos": 24134700228480.0, "grad_norm": 1.6724632615545945, "language_loss": 0.79498589, "learning_rate": 2.899486274782127e-06, "loss": 0.81669056, "num_input_tokens_seen": 132637660, "step": 6170, "time_per_iteration": 2.738492727279663 }, { "auxiliary_loss_clip": 0.01122253, "auxiliary_loss_mlp": 0.01045679, "balance_loss_clip": 1.04780805, "balance_loss_mlp": 1.02913237, "epoch": 0.37102059221403877, "flos": 23876071326720.0, "grad_norm": 1.739457755704792, "language_loss": 0.76506341, "learning_rate": 2.8991384069358885e-06, "loss": 0.78674281, "num_input_tokens_seen": 132657635, "step": 6171, "time_per_iteration": 2.6531472206115723 }, { "auxiliary_loss_clip": 0.01112543, "auxiliary_loss_mlp": 0.01041865, "balance_loss_clip": 1.05081654, "balance_loss_mlp": 1.02546144, "epoch": 0.37108071546670673, "flos": 14501268149760.0, "grad_norm": 2.0084032146250608, "language_loss": 0.80705774, "learning_rate": 2.898790504994232e-06, "loss": 0.82860184, "num_input_tokens_seen": 132674455, "step": 6172, "time_per_iteration": 2.6587960720062256 }, { "auxiliary_loss_clip": 0.01125694, "auxiliary_loss_mlp": 0.01044257, "balance_loss_clip": 1.0475564, "balance_loss_mlp": 1.02747262, "epoch": 0.3711408387193747, "flos": 34562619279360.0, "grad_norm": 2.410153405618026, "language_loss": 0.59260982, "learning_rate": 2.89844256897035e-06, "loss": 0.61430931, "num_input_tokens_seen": 132695140, "step": 6173, "time_per_iteration": 2.738430976867676 }, { "auxiliary_loss_clip": 0.01110933, "auxiliary_loss_mlp": 0.01044385, "balance_loss_clip": 1.04549873, "balance_loss_mlp": 1.02885222, "epoch": 0.37120096197204266, "flos": 17310703432320.0, "grad_norm": 1.954423749693878, "language_loss": 0.80869365, "learning_rate": 2.898094598877435e-06, "loss": 0.83024681, "num_input_tokens_seen": 132712470, "step": 6174, "time_per_iteration": 2.7166690826416016 }, { "auxiliary_loss_clip": 0.01129522, "auxiliary_loss_mlp": 0.01045042, "balance_loss_clip": 1.04628158, "balance_loss_mlp": 1.03025961, "epoch": 0.37126108522471063, "flos": 30664049760000.0, "grad_norm": 2.1592050046005, "language_loss": 0.79910219, "learning_rate": 2.8977465947286826e-06, "loss": 0.82084787, "num_input_tokens_seen": 132732945, "step": 6175, "time_per_iteration": 2.6746280193328857 }, { "auxiliary_loss_clip": 0.011267, "auxiliary_loss_mlp": 0.01053826, "balance_loss_clip": 1.05173898, "balance_loss_mlp": 1.0380547, "epoch": 0.37132120847737865, "flos": 25155640494720.0, "grad_norm": 2.2578092376668315, "language_loss": 0.88735723, "learning_rate": 2.89739855653729e-06, "loss": 0.90916252, "num_input_tokens_seen": 132752470, "step": 6176, "time_per_iteration": 2.6791093349456787 }, { "auxiliary_loss_clip": 0.01124216, "auxiliary_loss_mlp": 0.01042973, "balance_loss_clip": 1.04811859, "balance_loss_mlp": 1.02713037, "epoch": 0.3713813317300466, "flos": 21213474842880.0, "grad_norm": 1.5716198978013565, "language_loss": 0.73431349, "learning_rate": 2.8970504843164546e-06, "loss": 0.75598538, "num_input_tokens_seen": 132771485, "step": 6177, "time_per_iteration": 2.6808605194091797 }, { "auxiliary_loss_clip": 0.01102086, "auxiliary_loss_mlp": 0.01051929, "balance_loss_clip": 1.04524541, "balance_loss_mlp": 1.03575838, "epoch": 0.3714414549827146, "flos": 21616644072960.0, "grad_norm": 2.0030850547718915, "language_loss": 0.75349051, "learning_rate": 2.896702378079374e-06, "loss": 0.77503073, "num_input_tokens_seen": 132791465, "step": 6178, "time_per_iteration": 2.7112066745758057 }, { "auxiliary_loss_clip": 0.0107122, "auxiliary_loss_mlp": 0.01050415, "balance_loss_clip": 1.04323864, "balance_loss_mlp": 1.03208089, "epoch": 0.37150157823538255, "flos": 19972294335360.0, "grad_norm": 2.0305314414463136, "language_loss": 0.72141892, "learning_rate": 2.8963542378392502e-06, "loss": 0.74263525, "num_input_tokens_seen": 132810160, "step": 6179, "time_per_iteration": 2.7965877056121826 }, { "auxiliary_loss_clip": 0.01137504, "auxiliary_loss_mlp": 0.01046799, "balance_loss_clip": 1.05008841, "balance_loss_mlp": 1.03018165, "epoch": 0.3715617014880505, "flos": 24860562266880.0, "grad_norm": 2.387630814732786, "language_loss": 0.6993162, "learning_rate": 2.896006063609283e-06, "loss": 0.72115916, "num_input_tokens_seen": 132831265, "step": 6180, "time_per_iteration": 2.695232391357422 }, { "auxiliary_loss_clip": 0.01113448, "auxiliary_loss_mlp": 0.01037109, "balance_loss_clip": 1.04914021, "balance_loss_mlp": 1.02208257, "epoch": 0.3716218247407185, "flos": 20449080489600.0, "grad_norm": 2.1080005695464243, "language_loss": 0.77920252, "learning_rate": 2.8956578554026767e-06, "loss": 0.80070812, "num_input_tokens_seen": 132850005, "step": 6181, "time_per_iteration": 2.7087795734405518 }, { "auxiliary_loss_clip": 0.01123157, "auxiliary_loss_mlp": 0.01041815, "balance_loss_clip": 1.05016994, "balance_loss_mlp": 1.02525139, "epoch": 0.37168194799338644, "flos": 24133479166080.0, "grad_norm": 2.570629027716188, "language_loss": 0.79222846, "learning_rate": 2.8953096132326343e-06, "loss": 0.81387818, "num_input_tokens_seen": 132865790, "step": 6182, "time_per_iteration": 2.6541473865509033 }, { "auxiliary_loss_clip": 0.01041849, "auxiliary_loss_mlp": 0.01016945, "balance_loss_clip": 1.03053021, "balance_loss_mlp": 1.01533604, "epoch": 0.3717420712460544, "flos": 67408926900480.0, "grad_norm": 0.7830434308203498, "language_loss": 0.57445002, "learning_rate": 2.894961337112362e-06, "loss": 0.59503794, "num_input_tokens_seen": 132921775, "step": 6183, "time_per_iteration": 3.191969633102417 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.00775242, "balance_loss_clip": 1.04496169, "balance_loss_mlp": 1.00043631, "epoch": 0.37180219449872237, "flos": 22376908362240.0, "grad_norm": 1.9647478507461604, "language_loss": 0.76617277, "learning_rate": 2.894613027055066e-06, "loss": 0.78519297, "num_input_tokens_seen": 132941060, "step": 6184, "time_per_iteration": 2.7096588611602783 }, { "auxiliary_loss_clip": 0.01090654, "auxiliary_loss_mlp": 0.01039062, "balance_loss_clip": 1.04084587, "balance_loss_mlp": 1.02344596, "epoch": 0.37186231775139034, "flos": 21869885934720.0, "grad_norm": 2.1021072738728717, "language_loss": 0.7217713, "learning_rate": 2.894264683073954e-06, "loss": 0.74306846, "num_input_tokens_seen": 132961850, "step": 6185, "time_per_iteration": 2.739130735397339 }, { "auxiliary_loss_clip": 0.01081138, "auxiliary_loss_mlp": 0.01034498, "balance_loss_clip": 1.04156423, "balance_loss_mlp": 1.01805878, "epoch": 0.3719224410040583, "flos": 22415225195520.0, "grad_norm": 2.1871647895832496, "language_loss": 0.76805776, "learning_rate": 2.8939163051822363e-06, "loss": 0.78921413, "num_input_tokens_seen": 132981625, "step": 6186, "time_per_iteration": 2.779510259628296 }, { "auxiliary_loss_clip": 0.01131414, "auxiliary_loss_mlp": 0.01042221, "balance_loss_clip": 1.05090106, "balance_loss_mlp": 1.02491212, "epoch": 0.37198256425672627, "flos": 25151223121920.0, "grad_norm": 1.8929067887672733, "language_loss": 0.84037393, "learning_rate": 2.8935678933931224e-06, "loss": 0.86211032, "num_input_tokens_seen": 133001225, "step": 6187, "time_per_iteration": 2.67541241645813 }, { "auxiliary_loss_clip": 0.01120953, "auxiliary_loss_mlp": 0.01040882, "balance_loss_clip": 1.04474545, "balance_loss_mlp": 1.02553999, "epoch": 0.37204268750939423, "flos": 21138313633920.0, "grad_norm": 1.7194664317181616, "language_loss": 0.84831274, "learning_rate": 2.893219447719824e-06, "loss": 0.86993104, "num_input_tokens_seen": 133018820, "step": 6188, "time_per_iteration": 2.6241226196289062 }, { "auxiliary_loss_clip": 0.01108827, "auxiliary_loss_mlp": 0.01040814, "balance_loss_clip": 1.04934168, "balance_loss_mlp": 1.02501917, "epoch": 0.37210281076206225, "flos": 21506829217920.0, "grad_norm": 2.498329305558477, "language_loss": 0.65702367, "learning_rate": 2.8928709681755548e-06, "loss": 0.67852014, "num_input_tokens_seen": 133040205, "step": 6189, "time_per_iteration": 2.724707841873169 }, { "auxiliary_loss_clip": 0.01112219, "auxiliary_loss_mlp": 0.0104713, "balance_loss_clip": 1.0451889, "balance_loss_mlp": 1.03045225, "epoch": 0.3721629340147302, "flos": 17347835116800.0, "grad_norm": 1.9571366893805608, "language_loss": 0.84120989, "learning_rate": 2.8925224547735293e-06, "loss": 0.86280334, "num_input_tokens_seen": 133058095, "step": 6190, "time_per_iteration": 2.719454050064087 }, { "auxiliary_loss_clip": 0.01109992, "auxiliary_loss_mlp": 0.01041587, "balance_loss_clip": 1.0465343, "balance_loss_mlp": 1.02571416, "epoch": 0.3722230572673982, "flos": 16432400073600.0, "grad_norm": 4.021000090429005, "language_loss": 0.87807733, "learning_rate": 2.8921739075269633e-06, "loss": 0.89959311, "num_input_tokens_seen": 133071530, "step": 6191, "time_per_iteration": 2.7081027030944824 }, { "auxiliary_loss_clip": 0.0108777, "auxiliary_loss_mlp": 0.01037991, "balance_loss_clip": 1.04300189, "balance_loss_mlp": 1.01962125, "epoch": 0.37228318052006615, "flos": 22674716023680.0, "grad_norm": 3.7199150853096508, "language_loss": 0.74228656, "learning_rate": 2.891825326449073e-06, "loss": 0.7635442, "num_input_tokens_seen": 133091410, "step": 6192, "time_per_iteration": 2.8161356449127197 }, { "auxiliary_loss_clip": 0.01134777, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.04818201, "balance_loss_mlp": 1.02497888, "epoch": 0.3723433037727341, "flos": 25265491263360.0, "grad_norm": 2.31871347399746, "language_loss": 0.80621845, "learning_rate": 2.8914767115530766e-06, "loss": 0.82796752, "num_input_tokens_seen": 133110365, "step": 6193, "time_per_iteration": 2.661550760269165 }, { "auxiliary_loss_clip": 0.01101478, "auxiliary_loss_mlp": 0.01041083, "balance_loss_clip": 1.04354334, "balance_loss_mlp": 1.02522826, "epoch": 0.3724034270254021, "flos": 10524664333440.0, "grad_norm": 2.475173523724827, "language_loss": 0.84729886, "learning_rate": 2.891128062852194e-06, "loss": 0.86872447, "num_input_tokens_seen": 133128255, "step": 6194, "time_per_iteration": 2.711531400680542 }, { "auxiliary_loss_clip": 0.0111161, "auxiliary_loss_mlp": 0.010372, "balance_loss_clip": 1.04650784, "balance_loss_mlp": 1.02142286, "epoch": 0.37246355027807004, "flos": 20266223328000.0, "grad_norm": 9.44838101604173, "language_loss": 0.77016377, "learning_rate": 2.890779380359646e-06, "loss": 0.79165184, "num_input_tokens_seen": 133143975, "step": 6195, "time_per_iteration": 2.6527512073516846 }, { "auxiliary_loss_clip": 0.01112195, "auxiliary_loss_mlp": 0.0103539, "balance_loss_clip": 1.0468967, "balance_loss_mlp": 1.02030444, "epoch": 0.372523673530738, "flos": 19500571998720.0, "grad_norm": 1.7021548935758455, "language_loss": 0.79216856, "learning_rate": 2.890430664088655e-06, "loss": 0.81364441, "num_input_tokens_seen": 133162935, "step": 6196, "time_per_iteration": 2.6642892360687256 }, { "auxiliary_loss_clip": 0.01124648, "auxiliary_loss_mlp": 0.01038359, "balance_loss_clip": 1.04975688, "balance_loss_mlp": 1.0240953, "epoch": 0.372583796783406, "flos": 16764250849920.0, "grad_norm": 2.570886031241156, "language_loss": 0.83998835, "learning_rate": 2.890081914052443e-06, "loss": 0.8616184, "num_input_tokens_seen": 133181180, "step": 6197, "time_per_iteration": 2.627305030822754 }, { "auxiliary_loss_clip": 0.01131102, "auxiliary_loss_mlp": 0.01040963, "balance_loss_clip": 1.04697967, "balance_loss_mlp": 1.02488184, "epoch": 0.37264392003607394, "flos": 22637979388800.0, "grad_norm": 1.697216275583005, "language_loss": 0.64450538, "learning_rate": 2.889733130264237e-06, "loss": 0.66622603, "num_input_tokens_seen": 133199615, "step": 6198, "time_per_iteration": 2.606621503829956 }, { "auxiliary_loss_clip": 0.01120059, "auxiliary_loss_mlp": 0.01044451, "balance_loss_clip": 1.04676938, "balance_loss_mlp": 1.02959776, "epoch": 0.3727040432887419, "flos": 19973120348160.0, "grad_norm": 1.4273324893736263, "language_loss": 0.737185, "learning_rate": 2.889384312737261e-06, "loss": 0.75883007, "num_input_tokens_seen": 133219650, "step": 6199, "time_per_iteration": 2.78157901763916 }, { "auxiliary_loss_clip": 0.01105963, "auxiliary_loss_mlp": 0.01037053, "balance_loss_clip": 1.04564095, "balance_loss_mlp": 1.02154374, "epoch": 0.37276416654140987, "flos": 63899122279680.0, "grad_norm": 2.2948998309451905, "language_loss": 0.80481982, "learning_rate": 2.889035461484742e-06, "loss": 0.82624996, "num_input_tokens_seen": 133245675, "step": 6200, "time_per_iteration": 3.0623533725738525 }, { "auxiliary_loss_clip": 0.0109608, "auxiliary_loss_mlp": 0.01045798, "balance_loss_clip": 1.04552174, "balance_loss_mlp": 1.03016961, "epoch": 0.37282428979407783, "flos": 39785970211200.0, "grad_norm": 2.0774746879263746, "language_loss": 0.60494614, "learning_rate": 2.88868657651991e-06, "loss": 0.62636495, "num_input_tokens_seen": 133266905, "step": 6201, "time_per_iteration": 2.8960700035095215 }, { "auxiliary_loss_clip": 0.01125447, "auxiliary_loss_mlp": 0.01039384, "balance_loss_clip": 1.0489639, "balance_loss_mlp": 1.02346373, "epoch": 0.37288441304674586, "flos": 22709046447360.0, "grad_norm": 1.870117482164085, "language_loss": 0.72692698, "learning_rate": 2.8883376578559934e-06, "loss": 0.74857527, "num_input_tokens_seen": 133286865, "step": 6202, "time_per_iteration": 4.202298402786255 }, { "auxiliary_loss_clip": 0.01110741, "auxiliary_loss_mlp": 0.01033326, "balance_loss_clip": 1.04642594, "balance_loss_mlp": 1.01800799, "epoch": 0.3729445362994138, "flos": 18770292587520.0, "grad_norm": 2.0679450432005666, "language_loss": 0.74148834, "learning_rate": 2.8879887055062243e-06, "loss": 0.76292896, "num_input_tokens_seen": 133305295, "step": 6203, "time_per_iteration": 2.7268033027648926 }, { "auxiliary_loss_clip": 0.01106859, "auxiliary_loss_mlp": 0.01038826, "balance_loss_clip": 1.04595554, "balance_loss_mlp": 1.02524805, "epoch": 0.3730046595520818, "flos": 22456199635200.0, "grad_norm": 1.649450499506288, "language_loss": 0.81921744, "learning_rate": 2.8876397194838353e-06, "loss": 0.84067428, "num_input_tokens_seen": 133324625, "step": 6204, "time_per_iteration": 4.347074747085571 }, { "auxiliary_loss_clip": 0.01123916, "auxiliary_loss_mlp": 0.01044159, "balance_loss_clip": 1.04827762, "balance_loss_mlp": 1.02794707, "epoch": 0.37306478280474975, "flos": 24316372241280.0, "grad_norm": 1.675399556922802, "language_loss": 0.74961317, "learning_rate": 2.8872906998020577e-06, "loss": 0.77129394, "num_input_tokens_seen": 133344625, "step": 6205, "time_per_iteration": 2.66701602935791 }, { "auxiliary_loss_clip": 0.01117233, "auxiliary_loss_mlp": 0.01045323, "balance_loss_clip": 1.04337549, "balance_loss_mlp": 1.02857447, "epoch": 0.3731249060574177, "flos": 15815167741440.0, "grad_norm": 1.8318607259579, "language_loss": 0.7815854, "learning_rate": 2.886941646474128e-06, "loss": 0.80321097, "num_input_tokens_seen": 133363605, "step": 6206, "time_per_iteration": 4.202580451965332 }, { "auxiliary_loss_clip": 0.01134488, "auxiliary_loss_mlp": 0.01039926, "balance_loss_clip": 1.04804325, "balance_loss_mlp": 1.02317739, "epoch": 0.3731850293100857, "flos": 19828077229440.0, "grad_norm": 2.3232535418166256, "language_loss": 0.93322426, "learning_rate": 2.886592559513283e-06, "loss": 0.95496845, "num_input_tokens_seen": 133379405, "step": 6207, "time_per_iteration": 4.318574666976929 }, { "auxiliary_loss_clip": 0.01105421, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.0478878, "balance_loss_mlp": 1.01876843, "epoch": 0.37324515256275365, "flos": 19062354072960.0, "grad_norm": 3.0736568130228363, "language_loss": 0.82651198, "learning_rate": 2.886243438932759e-06, "loss": 0.8479048, "num_input_tokens_seen": 133397585, "step": 6208, "time_per_iteration": 2.749662160873413 }, { "auxiliary_loss_clip": 0.01122225, "auxiliary_loss_mlp": 0.0103968, "balance_loss_clip": 1.04488516, "balance_loss_mlp": 1.0223707, "epoch": 0.3733052758154216, "flos": 20704333512960.0, "grad_norm": 2.0157740087962845, "language_loss": 0.73122764, "learning_rate": 2.8858942847457953e-06, "loss": 0.75284666, "num_input_tokens_seen": 133415365, "step": 6209, "time_per_iteration": 2.6315791606903076 }, { "auxiliary_loss_clip": 0.01095649, "auxiliary_loss_mlp": 0.01037134, "balance_loss_clip": 1.04820108, "balance_loss_mlp": 1.02065969, "epoch": 0.3733653990680896, "flos": 20193504243840.0, "grad_norm": 1.9650719997143145, "language_loss": 0.70413053, "learning_rate": 2.8855450969656305e-06, "loss": 0.72545838, "num_input_tokens_seen": 133435700, "step": 6210, "time_per_iteration": 2.7484405040740967 }, { "auxiliary_loss_clip": 0.01072484, "auxiliary_loss_mlp": 0.01045611, "balance_loss_clip": 1.03769457, "balance_loss_mlp": 1.02674007, "epoch": 0.37342552232075754, "flos": 20339660684160.0, "grad_norm": 2.0510282142916427, "language_loss": 0.77773547, "learning_rate": 2.8851958756055073e-06, "loss": 0.79891646, "num_input_tokens_seen": 133455180, "step": 6211, "time_per_iteration": 2.706294536590576 }, { "auxiliary_loss_clip": 0.01122999, "auxiliary_loss_mlp": 0.01042393, "balance_loss_clip": 1.04602683, "balance_loss_mlp": 1.02645469, "epoch": 0.3734856455734255, "flos": 35517879527040.0, "grad_norm": 1.675173432335243, "language_loss": 0.73258781, "learning_rate": 2.884846620678668e-06, "loss": 0.7542417, "num_input_tokens_seen": 133476715, "step": 6212, "time_per_iteration": 2.788787841796875 }, { "auxiliary_loss_clip": 0.01131124, "auxiliary_loss_mlp": 0.01047595, "balance_loss_clip": 1.05055571, "balance_loss_mlp": 1.03106034, "epoch": 0.37354576882609347, "flos": 21142300043520.0, "grad_norm": 1.9808770110660865, "language_loss": 0.81656909, "learning_rate": 2.884497332198356e-06, "loss": 0.83835626, "num_input_tokens_seen": 133494550, "step": 6213, "time_per_iteration": 2.6829304695129395 }, { "auxiliary_loss_clip": 0.01089374, "auxiliary_loss_mlp": 0.01046172, "balance_loss_clip": 1.0412662, "balance_loss_mlp": 1.02843404, "epoch": 0.37360589207876144, "flos": 21506793304320.0, "grad_norm": 2.223600899112558, "language_loss": 0.78999674, "learning_rate": 2.8841480101778167e-06, "loss": 0.81135225, "num_input_tokens_seen": 133512640, "step": 6214, "time_per_iteration": 2.674373149871826 }, { "auxiliary_loss_clip": 0.01109052, "auxiliary_loss_mlp": 0.01044175, "balance_loss_clip": 1.04420567, "balance_loss_mlp": 1.02827835, "epoch": 0.37366601533142946, "flos": 38435800861440.0, "grad_norm": 1.9266500277332215, "language_loss": 0.84611148, "learning_rate": 2.883798654630296e-06, "loss": 0.86764371, "num_input_tokens_seen": 133535540, "step": 6215, "time_per_iteration": 2.8276026248931885 }, { "auxiliary_loss_clip": 0.01100197, "auxiliary_loss_mlp": 0.01039814, "balance_loss_clip": 1.04435837, "balance_loss_mlp": 1.02298141, "epoch": 0.3737261385840974, "flos": 18441171244800.0, "grad_norm": 1.8731663372997254, "language_loss": 0.67690969, "learning_rate": 2.8834492655690423e-06, "loss": 0.69830984, "num_input_tokens_seen": 133555795, "step": 6216, "time_per_iteration": 2.724090576171875 }, { "auxiliary_loss_clip": 0.01111654, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.045977, "balance_loss_mlp": 1.02578092, "epoch": 0.3737862618367654, "flos": 22929861306240.0, "grad_norm": 2.3172976096058853, "language_loss": 0.65993899, "learning_rate": 2.883099843007303e-06, "loss": 0.68148154, "num_input_tokens_seen": 133575905, "step": 6217, "time_per_iteration": 2.7126269340515137 }, { "auxiliary_loss_clip": 0.01115905, "auxiliary_loss_mlp": 0.01039702, "balance_loss_clip": 1.0483315, "balance_loss_mlp": 1.02264857, "epoch": 0.37384638508943335, "flos": 15409664127360.0, "grad_norm": 2.0273109551694777, "language_loss": 0.80449212, "learning_rate": 2.88275038695833e-06, "loss": 0.82604814, "num_input_tokens_seen": 133592585, "step": 6218, "time_per_iteration": 2.680894374847412 }, { "auxiliary_loss_clip": 0.01115539, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.04488862, "balance_loss_mlp": 1.01760781, "epoch": 0.3739065083421013, "flos": 24280820755200.0, "grad_norm": 1.5960804840892617, "language_loss": 0.78692639, "learning_rate": 2.8824008974353736e-06, "loss": 0.80841064, "num_input_tokens_seen": 133615070, "step": 6219, "time_per_iteration": 2.6683976650238037 }, { "auxiliary_loss_clip": 0.01107805, "auxiliary_loss_mlp": 0.01040758, "balance_loss_clip": 1.04602623, "balance_loss_mlp": 1.0247364, "epoch": 0.3739666315947693, "flos": 23002831785600.0, "grad_norm": 1.8103875982928064, "language_loss": 0.77023458, "learning_rate": 2.8820513744516866e-06, "loss": 0.79172027, "num_input_tokens_seen": 133633490, "step": 6220, "time_per_iteration": 2.670686960220337 }, { "auxiliary_loss_clip": 0.01105245, "auxiliary_loss_mlp": 0.01041158, "balance_loss_clip": 1.04717016, "balance_loss_mlp": 1.02473164, "epoch": 0.37402675484743725, "flos": 19391116279680.0, "grad_norm": 3.4153989861378204, "language_loss": 0.8298834, "learning_rate": 2.8817018180205235e-06, "loss": 0.85134745, "num_input_tokens_seen": 133653425, "step": 6221, "time_per_iteration": 2.730738401412964 }, { "auxiliary_loss_clip": 0.01108391, "auxiliary_loss_mlp": 0.01043965, "balance_loss_clip": 1.04499435, "balance_loss_mlp": 1.02825367, "epoch": 0.3740868781001052, "flos": 17126158331520.0, "grad_norm": 1.9668982067313725, "language_loss": 0.75944567, "learning_rate": 2.8813522281551387e-06, "loss": 0.78096926, "num_input_tokens_seen": 133670220, "step": 6222, "time_per_iteration": 2.62052321434021 }, { "auxiliary_loss_clip": 0.01103117, "auxiliary_loss_mlp": 0.00772891, "balance_loss_clip": 1.04785156, "balance_loss_mlp": 1.00029564, "epoch": 0.3741470013527732, "flos": 20043505048320.0, "grad_norm": 1.8881600065301847, "language_loss": 0.70621789, "learning_rate": 2.881002604868789e-06, "loss": 0.72497797, "num_input_tokens_seen": 133688910, "step": 6223, "time_per_iteration": 2.7686285972595215 }, { "auxiliary_loss_clip": 0.01104752, "auxiliary_loss_mlp": 0.01035203, "balance_loss_clip": 1.05155015, "balance_loss_mlp": 1.02057576, "epoch": 0.37420712460544114, "flos": 36897279569280.0, "grad_norm": 2.1852519558340644, "language_loss": 0.6875304, "learning_rate": 2.8806529481747325e-06, "loss": 0.7089299, "num_input_tokens_seen": 133708690, "step": 6224, "time_per_iteration": 2.817263126373291 }, { "auxiliary_loss_clip": 0.01091747, "auxiliary_loss_mlp": 0.01036393, "balance_loss_clip": 1.04859614, "balance_loss_mlp": 1.02059817, "epoch": 0.3742672478581091, "flos": 22201198007040.0, "grad_norm": 2.246642459489035, "language_loss": 0.70192593, "learning_rate": 2.880303258086228e-06, "loss": 0.72320735, "num_input_tokens_seen": 133728095, "step": 6225, "time_per_iteration": 2.785083532333374 }, { "auxiliary_loss_clip": 0.01088757, "auxiliary_loss_mlp": 0.01048544, "balance_loss_clip": 1.04366183, "balance_loss_mlp": 1.03175974, "epoch": 0.3743273711107771, "flos": 24681547860480.0, "grad_norm": 2.1768682992812236, "language_loss": 0.7896018, "learning_rate": 2.879953534616536e-06, "loss": 0.81097472, "num_input_tokens_seen": 133745590, "step": 6226, "time_per_iteration": 2.7403974533081055 }, { "auxiliary_loss_clip": 0.01105293, "auxiliary_loss_mlp": 0.01039029, "balance_loss_clip": 1.04631484, "balance_loss_mlp": 1.02303696, "epoch": 0.37438749436344504, "flos": 24459619680000.0, "grad_norm": 1.7825799805329443, "language_loss": 0.67965841, "learning_rate": 2.879603777778917e-06, "loss": 0.70110166, "num_input_tokens_seen": 133766155, "step": 6227, "time_per_iteration": 2.6975693702697754 }, { "auxiliary_loss_clip": 0.01099252, "auxiliary_loss_mlp": 0.01034493, "balance_loss_clip": 1.04493213, "balance_loss_mlp": 1.01890039, "epoch": 0.374447617616113, "flos": 21798747048960.0, "grad_norm": 1.9005486801766094, "language_loss": 0.829476, "learning_rate": 2.879253987586635e-06, "loss": 0.85081351, "num_input_tokens_seen": 133783185, "step": 6228, "time_per_iteration": 2.7754271030426025 }, { "auxiliary_loss_clip": 0.01090082, "auxiliary_loss_mlp": 0.01048677, "balance_loss_clip": 1.04396605, "balance_loss_mlp": 1.03159404, "epoch": 0.374507740868781, "flos": 17968191932160.0, "grad_norm": 1.6406992237121778, "language_loss": 0.74450547, "learning_rate": 2.8789041640529535e-06, "loss": 0.76589304, "num_input_tokens_seen": 133800975, "step": 6229, "time_per_iteration": 2.6378824710845947 }, { "auxiliary_loss_clip": 0.0109707, "auxiliary_loss_mlp": 0.01035996, "balance_loss_clip": 1.0470053, "balance_loss_mlp": 1.01971197, "epoch": 0.374567864121449, "flos": 16105828596480.0, "grad_norm": 2.127994694324029, "language_loss": 0.83782691, "learning_rate": 2.8785543071911383e-06, "loss": 0.85915756, "num_input_tokens_seen": 133818020, "step": 6230, "time_per_iteration": 2.6857657432556152 }, { "auxiliary_loss_clip": 0.0112393, "auxiliary_loss_mlp": 0.01041627, "balance_loss_clip": 1.04905128, "balance_loss_mlp": 1.02556968, "epoch": 0.37462798737411696, "flos": 25773160135680.0, "grad_norm": 2.8382818326589145, "language_loss": 0.735865, "learning_rate": 2.878204417014456e-06, "loss": 0.75752056, "num_input_tokens_seen": 133840690, "step": 6231, "time_per_iteration": 2.7082016468048096 }, { "auxiliary_loss_clip": 0.0112579, "auxiliary_loss_mlp": 0.01046917, "balance_loss_clip": 1.05376148, "balance_loss_mlp": 1.03075266, "epoch": 0.3746881106267849, "flos": 16654507822080.0, "grad_norm": 2.9683381932525665, "language_loss": 0.7412858, "learning_rate": 2.8778544935361735e-06, "loss": 0.76301289, "num_input_tokens_seen": 133858350, "step": 6232, "time_per_iteration": 2.5764057636260986 }, { "auxiliary_loss_clip": 0.01106131, "auxiliary_loss_mlp": 0.01039245, "balance_loss_clip": 1.04461622, "balance_loss_mlp": 1.02237701, "epoch": 0.3747482338794529, "flos": 26177981391360.0, "grad_norm": 2.121427790242168, "language_loss": 0.77296579, "learning_rate": 2.877504536769561e-06, "loss": 0.79441959, "num_input_tokens_seen": 133879775, "step": 6233, "time_per_iteration": 2.692286252975464 }, { "auxiliary_loss_clip": 0.01118513, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.05093503, "balance_loss_mlp": 1.024593, "epoch": 0.37480835713212085, "flos": 12021061950720.0, "grad_norm": 1.8446337373318833, "language_loss": 0.69493848, "learning_rate": 2.8771545467278883e-06, "loss": 0.71652997, "num_input_tokens_seen": 133898295, "step": 6234, "time_per_iteration": 2.658332586288452 }, { "auxiliary_loss_clip": 0.01123531, "auxiliary_loss_mlp": 0.01042963, "balance_loss_clip": 1.04885483, "balance_loss_mlp": 1.02833033, "epoch": 0.3748684803847888, "flos": 19679263182720.0, "grad_norm": 1.9015387878630694, "language_loss": 0.82462788, "learning_rate": 2.8768045234244276e-06, "loss": 0.84629285, "num_input_tokens_seen": 133915230, "step": 6235, "time_per_iteration": 2.591198682785034 }, { "auxiliary_loss_clip": 0.01140927, "auxiliary_loss_mlp": 0.0103602, "balance_loss_clip": 1.05301189, "balance_loss_mlp": 1.02021289, "epoch": 0.3749286036374568, "flos": 20521189042560.0, "grad_norm": 1.8869628373328378, "language_loss": 0.78439927, "learning_rate": 2.8764544668724517e-06, "loss": 0.80616879, "num_input_tokens_seen": 133934110, "step": 6236, "time_per_iteration": 2.6754372119903564 }, { "auxiliary_loss_clip": 0.01118225, "auxiliary_loss_mlp": 0.01050242, "balance_loss_clip": 1.04519606, "balance_loss_mlp": 1.03202713, "epoch": 0.37498872689012475, "flos": 20704620821760.0, "grad_norm": 2.0770406770017242, "language_loss": 0.74357057, "learning_rate": 2.876104377085234e-06, "loss": 0.76525521, "num_input_tokens_seen": 133952395, "step": 6237, "time_per_iteration": 2.6760342121124268 }, { "auxiliary_loss_clip": 0.01114513, "auxiliary_loss_mlp": 0.00773766, "balance_loss_clip": 1.04626942, "balance_loss_mlp": 1.00036037, "epoch": 0.3750488501427927, "flos": 21574843620480.0, "grad_norm": 2.0699756536584633, "language_loss": 0.93258965, "learning_rate": 2.8757542540760508e-06, "loss": 0.95147252, "num_input_tokens_seen": 133969635, "step": 6238, "time_per_iteration": 2.6805243492126465 }, { "auxiliary_loss_clip": 0.01137619, "auxiliary_loss_mlp": 0.01037341, "balance_loss_clip": 1.04995167, "balance_loss_mlp": 1.02081275, "epoch": 0.3751089733954607, "flos": 15923869274880.0, "grad_norm": 2.3841921025147284, "language_loss": 0.70885909, "learning_rate": 2.8754040978581777e-06, "loss": 0.73060858, "num_input_tokens_seen": 133987215, "step": 6239, "time_per_iteration": 2.548285961151123 }, { "auxiliary_loss_clip": 0.01068531, "auxiliary_loss_mlp": 0.01040031, "balance_loss_clip": 1.04656243, "balance_loss_mlp": 1.02303219, "epoch": 0.37516909664812864, "flos": 36284644177920.0, "grad_norm": 1.601808094344726, "language_loss": 0.65752542, "learning_rate": 2.875053908444895e-06, "loss": 0.67861104, "num_input_tokens_seen": 134009250, "step": 6240, "time_per_iteration": 3.016897201538086 }, { "auxiliary_loss_clip": 0.01101858, "auxiliary_loss_mlp": 0.00773445, "balance_loss_clip": 1.04618907, "balance_loss_mlp": 1.00033951, "epoch": 0.3752292199007966, "flos": 13515915283200.0, "grad_norm": 2.721418670308367, "language_loss": 0.75816065, "learning_rate": 2.8747036858494795e-06, "loss": 0.7769137, "num_input_tokens_seen": 134026875, "step": 6241, "time_per_iteration": 4.402552843093872 }, { "auxiliary_loss_clip": 0.01103844, "auxiliary_loss_mlp": 0.01044119, "balance_loss_clip": 1.04654765, "balance_loss_mlp": 1.0276264, "epoch": 0.3752893431534646, "flos": 27198095644800.0, "grad_norm": 2.108703330368865, "language_loss": 0.83791685, "learning_rate": 2.874353430085213e-06, "loss": 0.85939646, "num_input_tokens_seen": 134047185, "step": 6242, "time_per_iteration": 2.7508704662323 }, { "auxiliary_loss_clip": 0.01110348, "auxiliary_loss_mlp": 0.01048171, "balance_loss_clip": 1.04799628, "balance_loss_mlp": 1.03319848, "epoch": 0.3753494664061326, "flos": 30007674581760.0, "grad_norm": 2.4924519814208774, "language_loss": 0.68438506, "learning_rate": 2.8740031411653766e-06, "loss": 0.70597029, "num_input_tokens_seen": 134067330, "step": 6243, "time_per_iteration": 2.7814478874206543 }, { "auxiliary_loss_clip": 0.01056696, "auxiliary_loss_mlp": 0.00776554, "balance_loss_clip": 1.04175019, "balance_loss_mlp": 1.00038528, "epoch": 0.37540958965880056, "flos": 24461954064000.0, "grad_norm": 1.7699519682943652, "language_loss": 0.84165168, "learning_rate": 2.8736528191032535e-06, "loss": 0.85998416, "num_input_tokens_seen": 134085525, "step": 6244, "time_per_iteration": 4.510041952133179 }, { "auxiliary_loss_clip": 0.01074238, "auxiliary_loss_mlp": 0.01042872, "balance_loss_clip": 1.03981614, "balance_loss_mlp": 1.02712417, "epoch": 0.3754697129114685, "flos": 16508387295360.0, "grad_norm": 2.7453088605805616, "language_loss": 0.82679987, "learning_rate": 2.8733024639121277e-06, "loss": 0.84797096, "num_input_tokens_seen": 134101855, "step": 6245, "time_per_iteration": 4.745215654373169 }, { "auxiliary_loss_clip": 0.01096909, "auxiliary_loss_mlp": 0.0104658, "balance_loss_clip": 1.04049206, "balance_loss_mlp": 1.0296756, "epoch": 0.3755298361641365, "flos": 19390900798080.0, "grad_norm": 8.46557879021872, "language_loss": 0.63902843, "learning_rate": 2.8729520756052853e-06, "loss": 0.66046333, "num_input_tokens_seen": 134119360, "step": 6246, "time_per_iteration": 4.33053731918335 }, { "auxiliary_loss_clip": 0.01112093, "auxiliary_loss_mlp": 0.0104355, "balance_loss_clip": 1.04961443, "balance_loss_mlp": 1.0264082, "epoch": 0.37558995941680445, "flos": 14720395069440.0, "grad_norm": 2.0508038288587183, "language_loss": 0.74467009, "learning_rate": 2.8726016541960124e-06, "loss": 0.76622653, "num_input_tokens_seen": 134137475, "step": 6247, "time_per_iteration": 2.688081979751587 }, { "auxiliary_loss_clip": 0.01126872, "auxiliary_loss_mlp": 0.01037368, "balance_loss_clip": 1.05022037, "balance_loss_mlp": 1.02133489, "epoch": 0.3756500826694724, "flos": 21689901861120.0, "grad_norm": 2.703785960910372, "language_loss": 0.5497098, "learning_rate": 2.872251199697598e-06, "loss": 0.57135224, "num_input_tokens_seen": 134154580, "step": 6248, "time_per_iteration": 2.6308822631835938 }, { "auxiliary_loss_clip": 0.01117073, "auxiliary_loss_mlp": 0.01036379, "balance_loss_clip": 1.04465234, "balance_loss_mlp": 1.0200597, "epoch": 0.3757102059221404, "flos": 26505666190080.0, "grad_norm": 4.209721572066423, "language_loss": 0.84492457, "learning_rate": 2.8719007121233297e-06, "loss": 0.86645913, "num_input_tokens_seen": 134174285, "step": 6249, "time_per_iteration": 2.6539809703826904 }, { "auxiliary_loss_clip": 0.01107733, "auxiliary_loss_mlp": 0.01035495, "balance_loss_clip": 1.04784632, "balance_loss_mlp": 1.01956248, "epoch": 0.37577032917480835, "flos": 37338083274240.0, "grad_norm": 1.546160982958922, "language_loss": 0.67701882, "learning_rate": 2.8715501914864993e-06, "loss": 0.69845104, "num_input_tokens_seen": 134195940, "step": 6250, "time_per_iteration": 2.787398338317871 }, { "auxiliary_loss_clip": 0.01117019, "auxiliary_loss_mlp": 0.01044359, "balance_loss_clip": 1.04946029, "balance_loss_mlp": 1.0293386, "epoch": 0.3758304524274763, "flos": 21908597817600.0, "grad_norm": 1.960309683567346, "language_loss": 0.77824795, "learning_rate": 2.8711996378003987e-06, "loss": 0.79986179, "num_input_tokens_seen": 134212235, "step": 6251, "time_per_iteration": 2.7143123149871826 }, { "auxiliary_loss_clip": 0.01121024, "auxiliary_loss_mlp": 0.01039102, "balance_loss_clip": 1.04994178, "balance_loss_mlp": 1.0236522, "epoch": 0.3758905756801443, "flos": 36569343375360.0, "grad_norm": 2.527016245081176, "language_loss": 0.58002663, "learning_rate": 2.8708490510783203e-06, "loss": 0.60162789, "num_input_tokens_seen": 134233810, "step": 6252, "time_per_iteration": 2.716597557067871 }, { "auxiliary_loss_clip": 0.01116459, "auxiliary_loss_mlp": 0.01042556, "balance_loss_clip": 1.05007291, "balance_loss_mlp": 1.0260098, "epoch": 0.37595069893281224, "flos": 24528783317760.0, "grad_norm": 4.856643583290163, "language_loss": 0.89482141, "learning_rate": 2.8704984313335584e-06, "loss": 0.91641152, "num_input_tokens_seen": 134252020, "step": 6253, "time_per_iteration": 2.701361894607544 }, { "auxiliary_loss_clip": 0.01098154, "auxiliary_loss_mlp": 0.01040398, "balance_loss_clip": 1.04815936, "balance_loss_mlp": 1.02562761, "epoch": 0.3760108221854802, "flos": 16435021766400.0, "grad_norm": 2.218099502464204, "language_loss": 0.76568806, "learning_rate": 2.8701477785794097e-06, "loss": 0.78707361, "num_input_tokens_seen": 134269495, "step": 6254, "time_per_iteration": 2.6995303630828857 }, { "auxiliary_loss_clip": 0.01096995, "auxiliary_loss_mlp": 0.01043484, "balance_loss_clip": 1.04379475, "balance_loss_mlp": 1.02628207, "epoch": 0.37607094543814823, "flos": 13771742924160.0, "grad_norm": 2.131769376763656, "language_loss": 0.6180023, "learning_rate": 2.869797092829169e-06, "loss": 0.6394071, "num_input_tokens_seen": 134287035, "step": 6255, "time_per_iteration": 2.7164864540100098 }, { "auxiliary_loss_clip": 0.01127282, "auxiliary_loss_mlp": 0.01036673, "balance_loss_clip": 1.04883361, "balance_loss_mlp": 1.02017426, "epoch": 0.3761310686908162, "flos": 19857918453120.0, "grad_norm": 2.6629341180561545, "language_loss": 0.74404681, "learning_rate": 2.869446374096135e-06, "loss": 0.76568639, "num_input_tokens_seen": 134304840, "step": 6256, "time_per_iteration": 2.588169574737549 }, { "auxiliary_loss_clip": 0.01127124, "auxiliary_loss_mlp": 0.01046358, "balance_loss_clip": 1.04913831, "balance_loss_mlp": 1.02977645, "epoch": 0.37619119194348416, "flos": 12750802657920.0, "grad_norm": 2.3087979716808937, "language_loss": 0.702447, "learning_rate": 2.8690956223936088e-06, "loss": 0.72418177, "num_input_tokens_seen": 134323180, "step": 6257, "time_per_iteration": 2.701555013656616 }, { "auxiliary_loss_clip": 0.01110787, "auxiliary_loss_mlp": 0.01033343, "balance_loss_clip": 1.04812109, "balance_loss_mlp": 1.01796508, "epoch": 0.3762513151961521, "flos": 17530548624000.0, "grad_norm": 1.673769537318751, "language_loss": 0.84842372, "learning_rate": 2.868744837734889e-06, "loss": 0.86986494, "num_input_tokens_seen": 134341390, "step": 6258, "time_per_iteration": 2.6336703300476074 }, { "auxiliary_loss_clip": 0.01091689, "auxiliary_loss_mlp": 0.01041654, "balance_loss_clip": 1.04571128, "balance_loss_mlp": 1.0271697, "epoch": 0.3763114384488201, "flos": 23617406511360.0, "grad_norm": 1.4940028515654036, "language_loss": 0.80920124, "learning_rate": 2.868394020133277e-06, "loss": 0.83053464, "num_input_tokens_seen": 134360425, "step": 6259, "time_per_iteration": 2.752392053604126 }, { "auxiliary_loss_clip": 0.01093234, "auxiliary_loss_mlp": 0.01046443, "balance_loss_clip": 1.04547083, "balance_loss_mlp": 1.02969444, "epoch": 0.37637156170148806, "flos": 25406978935680.0, "grad_norm": 2.4951694968605627, "language_loss": 0.71285564, "learning_rate": 2.8680431696020783e-06, "loss": 0.73425239, "num_input_tokens_seen": 134379775, "step": 6260, "time_per_iteration": 2.782561779022217 }, { "auxiliary_loss_clip": 0.01107136, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.04386747, "balance_loss_mlp": 1.02305889, "epoch": 0.376431684954156, "flos": 23440906056960.0, "grad_norm": 1.627422352949978, "language_loss": 0.78342533, "learning_rate": 2.867692286154594e-06, "loss": 0.80488986, "num_input_tokens_seen": 134400315, "step": 6261, "time_per_iteration": 2.6978867053985596 }, { "auxiliary_loss_clip": 0.01112259, "auxiliary_loss_mlp": 0.01048861, "balance_loss_clip": 1.04744315, "balance_loss_mlp": 1.0312773, "epoch": 0.376491808206824, "flos": 34204482725760.0, "grad_norm": 2.418447947297228, "language_loss": 0.80871278, "learning_rate": 2.867341369804132e-06, "loss": 0.83032399, "num_input_tokens_seen": 134422875, "step": 6262, "time_per_iteration": 2.852675437927246 }, { "auxiliary_loss_clip": 0.01115101, "auxiliary_loss_mlp": 0.01038136, "balance_loss_clip": 1.04584765, "balance_loss_mlp": 1.02277565, "epoch": 0.37655193145949195, "flos": 35185669614720.0, "grad_norm": 2.9875520790285774, "language_loss": 0.80295742, "learning_rate": 2.866990420563998e-06, "loss": 0.82448983, "num_input_tokens_seen": 134443025, "step": 6263, "time_per_iteration": 2.785395622253418 }, { "auxiliary_loss_clip": 0.01140252, "auxiliary_loss_mlp": 0.01045838, "balance_loss_clip": 1.05247605, "balance_loss_mlp": 1.0300312, "epoch": 0.3766120547121599, "flos": 16761844638720.0, "grad_norm": 2.896352989954936, "language_loss": 0.79601765, "learning_rate": 2.866639438447501e-06, "loss": 0.81787854, "num_input_tokens_seen": 134460945, "step": 6264, "time_per_iteration": 2.581125497817993 }, { "auxiliary_loss_clip": 0.01133548, "auxiliary_loss_mlp": 0.0105155, "balance_loss_clip": 1.04770851, "balance_loss_mlp": 1.03557551, "epoch": 0.3766721779648279, "flos": 23550361776000.0, "grad_norm": 2.0921625870578913, "language_loss": 0.73808366, "learning_rate": 2.8662884234679497e-06, "loss": 0.75993466, "num_input_tokens_seen": 134480440, "step": 6265, "time_per_iteration": 2.6998226642608643 }, { "auxiliary_loss_clip": 0.01123221, "auxiliary_loss_mlp": 0.0103937, "balance_loss_clip": 1.05005145, "balance_loss_mlp": 1.02543402, "epoch": 0.37673230121749585, "flos": 29129191655040.0, "grad_norm": 1.9744000825782282, "language_loss": 0.68550873, "learning_rate": 2.865937375638654e-06, "loss": 0.70713472, "num_input_tokens_seen": 134501110, "step": 6266, "time_per_iteration": 2.6934731006622314 }, { "auxiliary_loss_clip": 0.01128105, "auxiliary_loss_mlp": 0.01041187, "balance_loss_clip": 1.04846668, "balance_loss_mlp": 1.02536833, "epoch": 0.3767924244701638, "flos": 28146783703680.0, "grad_norm": 3.437883319374573, "language_loss": 0.63078731, "learning_rate": 2.8655862949729264e-06, "loss": 0.65248024, "num_input_tokens_seen": 134522460, "step": 6267, "time_per_iteration": 2.7006735801696777 }, { "auxiliary_loss_clip": 0.01050407, "auxiliary_loss_mlp": 0.01011452, "balance_loss_clip": 1.02822745, "balance_loss_mlp": 1.00960469, "epoch": 0.37685254772283183, "flos": 60797197526400.0, "grad_norm": 0.7198108741876666, "language_loss": 0.58852816, "learning_rate": 2.8652351814840795e-06, "loss": 0.60914677, "num_input_tokens_seen": 134589545, "step": 6268, "time_per_iteration": 3.355120897293091 }, { "auxiliary_loss_clip": 0.011375, "auxiliary_loss_mlp": 0.01043603, "balance_loss_clip": 1.05033755, "balance_loss_mlp": 1.02698505, "epoch": 0.3769126709754998, "flos": 26032543223040.0, "grad_norm": 2.34128493463531, "language_loss": 0.65263468, "learning_rate": 2.8648840351854283e-06, "loss": 0.67444575, "num_input_tokens_seen": 134610550, "step": 6269, "time_per_iteration": 2.656585931777954 }, { "auxiliary_loss_clip": 0.01099912, "auxiliary_loss_mlp": 0.01041008, "balance_loss_clip": 1.04970932, "balance_loss_mlp": 1.02536798, "epoch": 0.37697279422816776, "flos": 23579879777280.0, "grad_norm": 1.5250715006737088, "language_loss": 0.7069717, "learning_rate": 2.8645328560902874e-06, "loss": 0.72838092, "num_input_tokens_seen": 134630485, "step": 6270, "time_per_iteration": 2.7498419284820557 }, { "auxiliary_loss_clip": 0.01059818, "auxiliary_loss_mlp": 0.01007405, "balance_loss_clip": 1.02900875, "balance_loss_mlp": 1.00581956, "epoch": 0.3770329174808357, "flos": 64745935367040.0, "grad_norm": 0.7193704591933474, "language_loss": 0.56122422, "learning_rate": 2.8641816442119746e-06, "loss": 0.58189648, "num_input_tokens_seen": 134693510, "step": 6271, "time_per_iteration": 3.1569089889526367 }, { "auxiliary_loss_clip": 0.01121208, "auxiliary_loss_mlp": 0.01042721, "balance_loss_clip": 1.04645681, "balance_loss_mlp": 1.02609181, "epoch": 0.3770930407335037, "flos": 21835304115840.0, "grad_norm": 2.1611051517344246, "language_loss": 0.79855239, "learning_rate": 2.8638303995638066e-06, "loss": 0.82019162, "num_input_tokens_seen": 134713115, "step": 6272, "time_per_iteration": 2.628180742263794 }, { "auxiliary_loss_clip": 0.01118748, "auxiliary_loss_mlp": 0.01033695, "balance_loss_clip": 1.0451988, "balance_loss_mlp": 1.01934206, "epoch": 0.37715316398617166, "flos": 22747901984640.0, "grad_norm": 2.0954681641544304, "language_loss": 0.73789483, "learning_rate": 2.863479122159103e-06, "loss": 0.75941932, "num_input_tokens_seen": 134732635, "step": 6273, "time_per_iteration": 2.7064390182495117 }, { "auxiliary_loss_clip": 0.01117899, "auxiliary_loss_mlp": 0.01044408, "balance_loss_clip": 1.04745209, "balance_loss_mlp": 1.02905381, "epoch": 0.3772132872388396, "flos": 18914581520640.0, "grad_norm": 1.6440580648783938, "language_loss": 0.71867502, "learning_rate": 2.8631278120111858e-06, "loss": 0.74029803, "num_input_tokens_seen": 134750695, "step": 6274, "time_per_iteration": 2.650559186935425 }, { "auxiliary_loss_clip": 0.01105418, "auxiliary_loss_mlp": 0.01040714, "balance_loss_clip": 1.04509926, "balance_loss_mlp": 1.02567029, "epoch": 0.3772734104915076, "flos": 17346219004800.0, "grad_norm": 1.9251108643001593, "language_loss": 0.83620244, "learning_rate": 2.8627764691333742e-06, "loss": 0.85766381, "num_input_tokens_seen": 134768935, "step": 6275, "time_per_iteration": 2.662346839904785 }, { "auxiliary_loss_clip": 0.01077547, "auxiliary_loss_mlp": 0.01035941, "balance_loss_clip": 1.04383206, "balance_loss_mlp": 1.02238655, "epoch": 0.37733353374417555, "flos": 32342370785280.0, "grad_norm": 1.4850375213112275, "language_loss": 0.75779188, "learning_rate": 2.8624250935389935e-06, "loss": 0.77892679, "num_input_tokens_seen": 134791260, "step": 6276, "time_per_iteration": 2.824374198913574 }, { "auxiliary_loss_clip": 0.01109985, "auxiliary_loss_mlp": 0.01039728, "balance_loss_clip": 1.04301822, "balance_loss_mlp": 1.02318192, "epoch": 0.3773936569968435, "flos": 23360681030400.0, "grad_norm": 1.996464283971086, "language_loss": 0.85758084, "learning_rate": 2.862073685241366e-06, "loss": 0.87907803, "num_input_tokens_seen": 134808350, "step": 6277, "time_per_iteration": 2.6880812644958496 }, { "auxiliary_loss_clip": 0.01123239, "auxiliary_loss_mlp": 0.01035838, "balance_loss_clip": 1.04981339, "balance_loss_mlp": 1.02147365, "epoch": 0.3774537802495115, "flos": 21466788531840.0, "grad_norm": 2.8692620956149613, "language_loss": 0.78788501, "learning_rate": 2.861722244253818e-06, "loss": 0.80947578, "num_input_tokens_seen": 134826005, "step": 6278, "time_per_iteration": 2.6566152572631836 }, { "auxiliary_loss_clip": 0.01104603, "auxiliary_loss_mlp": 0.01044359, "balance_loss_clip": 1.04592609, "balance_loss_mlp": 1.02740717, "epoch": 0.37751390350217945, "flos": 24973717086720.0, "grad_norm": 2.420687530183356, "language_loss": 0.8289634, "learning_rate": 2.8613707705896767e-06, "loss": 0.85045302, "num_input_tokens_seen": 134844995, "step": 6279, "time_per_iteration": 2.732966899871826 }, { "auxiliary_loss_clip": 0.01110227, "auxiliary_loss_mlp": 0.01039275, "balance_loss_clip": 1.04498839, "balance_loss_mlp": 1.02520263, "epoch": 0.3775740267548474, "flos": 27819098904960.0, "grad_norm": 5.36242068768128, "language_loss": 0.74968797, "learning_rate": 2.861019264262269e-06, "loss": 0.77118295, "num_input_tokens_seen": 134865285, "step": 6280, "time_per_iteration": 4.266780376434326 }, { "auxiliary_loss_clip": 0.01130932, "auxiliary_loss_mlp": 0.01036032, "balance_loss_clip": 1.04845715, "balance_loss_mlp": 1.02235854, "epoch": 0.3776341500075154, "flos": 22565224391040.0, "grad_norm": 1.4530407212668277, "language_loss": 0.76169163, "learning_rate": 2.8606677252849242e-06, "loss": 0.7833612, "num_input_tokens_seen": 134886535, "step": 6281, "time_per_iteration": 2.649930477142334 }, { "auxiliary_loss_clip": 0.01101629, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.04291892, "balance_loss_mlp": 1.02471018, "epoch": 0.3776942732601834, "flos": 23077238808960.0, "grad_norm": 2.430303484367767, "language_loss": 0.83814883, "learning_rate": 2.860316153670974e-06, "loss": 0.85956836, "num_input_tokens_seen": 134907435, "step": 6282, "time_per_iteration": 2.6882312297821045 }, { "auxiliary_loss_clip": 0.0111945, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.04452085, "balance_loss_mlp": 1.02134025, "epoch": 0.37775439651285136, "flos": 21724411852800.0, "grad_norm": 2.5787880774083725, "language_loss": 0.698241, "learning_rate": 2.8599645494337484e-06, "loss": 0.71980345, "num_input_tokens_seen": 134925360, "step": 6283, "time_per_iteration": 4.2020978927612305 }, { "auxiliary_loss_clip": 0.01072442, "auxiliary_loss_mlp": 0.01052062, "balance_loss_clip": 1.04226279, "balance_loss_mlp": 1.03394175, "epoch": 0.37781451976551933, "flos": 23987753688960.0, "grad_norm": 2.007181392308561, "language_loss": 0.76503819, "learning_rate": 2.859612912586581e-06, "loss": 0.78628325, "num_input_tokens_seen": 134944205, "step": 6284, "time_per_iteration": 4.349794387817383 }, { "auxiliary_loss_clip": 0.01142581, "auxiliary_loss_mlp": 0.01033355, "balance_loss_clip": 1.05249381, "balance_loss_mlp": 1.01713097, "epoch": 0.3778746430181873, "flos": 13727967223680.0, "grad_norm": 2.7318562260547554, "language_loss": 0.85677552, "learning_rate": 2.8592612431428055e-06, "loss": 0.87853491, "num_input_tokens_seen": 134960255, "step": 6285, "time_per_iteration": 2.6949870586395264 }, { "auxiliary_loss_clip": 0.01111269, "auxiliary_loss_mlp": 0.01042933, "balance_loss_clip": 1.04731882, "balance_loss_mlp": 1.02694702, "epoch": 0.37793476627085526, "flos": 19460495399040.0, "grad_norm": 1.8544385642750592, "language_loss": 0.84419537, "learning_rate": 2.858909541115758e-06, "loss": 0.86573738, "num_input_tokens_seen": 134978605, "step": 6286, "time_per_iteration": 4.541024684906006 }, { "auxiliary_loss_clip": 0.01120151, "auxiliary_loss_mlp": 0.01043503, "balance_loss_clip": 1.05024576, "balance_loss_mlp": 1.0280652, "epoch": 0.3779948895235232, "flos": 10707018704640.0, "grad_norm": 2.400905995704231, "language_loss": 0.81738019, "learning_rate": 2.858557806518775e-06, "loss": 0.83901674, "num_input_tokens_seen": 134995020, "step": 6287, "time_per_iteration": 2.6611125469207764 }, { "auxiliary_loss_clip": 0.01118978, "auxiliary_loss_mlp": 0.01041796, "balance_loss_clip": 1.04537022, "balance_loss_mlp": 1.02645934, "epoch": 0.3780550127761912, "flos": 22310007281280.0, "grad_norm": 3.0671932020533133, "language_loss": 0.73071134, "learning_rate": 2.8582060393651927e-06, "loss": 0.7523191, "num_input_tokens_seen": 135012620, "step": 6288, "time_per_iteration": 2.6759073734283447 }, { "auxiliary_loss_clip": 0.01124666, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.05113983, "balance_loss_mlp": 1.02115071, "epoch": 0.37811513602885916, "flos": 28950644125440.0, "grad_norm": 1.9644960153972613, "language_loss": 0.75616127, "learning_rate": 2.857854239668352e-06, "loss": 0.77777576, "num_input_tokens_seen": 135033365, "step": 6289, "time_per_iteration": 2.656367778778076 }, { "auxiliary_loss_clip": 0.0112159, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.04737473, "balance_loss_mlp": 1.02025056, "epoch": 0.3781752592815271, "flos": 23112933949440.0, "grad_norm": 1.7941331023092641, "language_loss": 0.73271513, "learning_rate": 2.857502407441593e-06, "loss": 0.75428718, "num_input_tokens_seen": 135052185, "step": 6290, "time_per_iteration": 2.740370512008667 }, { "auxiliary_loss_clip": 0.01098389, "auxiliary_loss_mlp": 0.01041015, "balance_loss_clip": 1.04425681, "balance_loss_mlp": 1.023193, "epoch": 0.3782353825341951, "flos": 19755932762880.0, "grad_norm": 8.943174604406142, "language_loss": 0.79843229, "learning_rate": 2.8571505426982566e-06, "loss": 0.81982636, "num_input_tokens_seen": 135070425, "step": 6291, "time_per_iteration": 2.729116916656494 }, { "auxiliary_loss_clip": 0.01101536, "auxiliary_loss_mlp": 0.01032627, "balance_loss_clip": 1.04736066, "balance_loss_mlp": 1.01611638, "epoch": 0.37829550578686305, "flos": 22050839675520.0, "grad_norm": 2.1381581001103203, "language_loss": 0.76017123, "learning_rate": 2.8567986454516854e-06, "loss": 0.78151298, "num_input_tokens_seen": 135090525, "step": 6292, "time_per_iteration": 2.7115557193756104 }, { "auxiliary_loss_clip": 0.0111659, "auxiliary_loss_mlp": 0.01045333, "balance_loss_clip": 1.04599166, "balance_loss_mlp": 1.02922773, "epoch": 0.378355629039531, "flos": 16470357770880.0, "grad_norm": 2.0329947363530616, "language_loss": 0.69857049, "learning_rate": 2.856446715715224e-06, "loss": 0.72018969, "num_input_tokens_seen": 135109575, "step": 6293, "time_per_iteration": 2.6687965393066406 }, { "auxiliary_loss_clip": 0.01133204, "auxiliary_loss_mlp": 0.01039264, "balance_loss_clip": 1.04852223, "balance_loss_mlp": 1.02307534, "epoch": 0.378415752292199, "flos": 19974844200960.0, "grad_norm": 2.030259976194038, "language_loss": 0.70870757, "learning_rate": 2.8560947535022173e-06, "loss": 0.73043227, "num_input_tokens_seen": 135127000, "step": 6294, "time_per_iteration": 2.600249767303467 }, { "auxiliary_loss_clip": 0.01115678, "auxiliary_loss_mlp": 0.01040569, "balance_loss_clip": 1.04706097, "balance_loss_mlp": 1.02365303, "epoch": 0.378475875544867, "flos": 14647388676480.0, "grad_norm": 4.788626069957177, "language_loss": 0.82803214, "learning_rate": 2.855742758826011e-06, "loss": 0.84959471, "num_input_tokens_seen": 135145285, "step": 6295, "time_per_iteration": 2.656090497970581 }, { "auxiliary_loss_clip": 0.0111937, "auxiliary_loss_mlp": 0.0103653, "balance_loss_clip": 1.04782999, "balance_loss_mlp": 1.02058005, "epoch": 0.37853599879753497, "flos": 26650996617600.0, "grad_norm": 9.577751233202987, "language_loss": 0.71744889, "learning_rate": 2.8553907316999547e-06, "loss": 0.73900783, "num_input_tokens_seen": 135165240, "step": 6296, "time_per_iteration": 2.6698925495147705 }, { "auxiliary_loss_clip": 0.01134516, "auxiliary_loss_mlp": 0.01043376, "balance_loss_clip": 1.05133939, "balance_loss_mlp": 1.02771211, "epoch": 0.37859612205020293, "flos": 17311960408320.0, "grad_norm": 3.288847845161644, "language_loss": 0.76889098, "learning_rate": 2.855038672137396e-06, "loss": 0.79066986, "num_input_tokens_seen": 135184045, "step": 6297, "time_per_iteration": 2.629037380218506 }, { "auxiliary_loss_clip": 0.01109354, "auxiliary_loss_mlp": 0.01038115, "balance_loss_clip": 1.04526067, "balance_loss_mlp": 1.02226055, "epoch": 0.3786562453028709, "flos": 18220392299520.0, "grad_norm": 1.9191527971099975, "language_loss": 0.79743183, "learning_rate": 2.854686580151684e-06, "loss": 0.81890655, "num_input_tokens_seen": 135202365, "step": 6298, "time_per_iteration": 2.673081874847412 }, { "auxiliary_loss_clip": 0.01075918, "auxiliary_loss_mlp": 0.01051187, "balance_loss_clip": 1.04113722, "balance_loss_mlp": 1.03267384, "epoch": 0.37871636855553886, "flos": 21214875473280.0, "grad_norm": 1.8248163373816215, "language_loss": 0.84369445, "learning_rate": 2.8543344557561722e-06, "loss": 0.86496556, "num_input_tokens_seen": 135220955, "step": 6299, "time_per_iteration": 2.748072862625122 }, { "auxiliary_loss_clip": 0.01104171, "auxiliary_loss_mlp": 0.01036156, "balance_loss_clip": 1.0473597, "balance_loss_mlp": 1.02021194, "epoch": 0.3787764918082068, "flos": 20952727038720.0, "grad_norm": 2.2683019346862587, "language_loss": 0.76286763, "learning_rate": 2.8539822989642116e-06, "loss": 0.78427088, "num_input_tokens_seen": 135239715, "step": 6300, "time_per_iteration": 2.742335796356201 }, { "auxiliary_loss_clip": 0.01118244, "auxiliary_loss_mlp": 0.01037884, "balance_loss_clip": 1.04743147, "balance_loss_mlp": 1.01999068, "epoch": 0.3788366150608748, "flos": 17308009912320.0, "grad_norm": 2.2544575031135863, "language_loss": 0.82409781, "learning_rate": 2.8536301097891577e-06, "loss": 0.84565908, "num_input_tokens_seen": 135257035, "step": 6301, "time_per_iteration": 2.6785736083984375 }, { "auxiliary_loss_clip": 0.01120863, "auxiliary_loss_mlp": 0.01039969, "balance_loss_clip": 1.04765666, "balance_loss_mlp": 1.02410781, "epoch": 0.37889673831354276, "flos": 24311092942080.0, "grad_norm": 2.7886341766039466, "language_loss": 0.67584914, "learning_rate": 2.8532778882443636e-06, "loss": 0.69745743, "num_input_tokens_seen": 135275720, "step": 6302, "time_per_iteration": 2.677690029144287 }, { "auxiliary_loss_clip": 0.01090953, "auxiliary_loss_mlp": 0.01043064, "balance_loss_clip": 1.04460323, "balance_loss_mlp": 1.02736425, "epoch": 0.3789568615662107, "flos": 26683603188480.0, "grad_norm": 1.752291551629032, "language_loss": 0.68745166, "learning_rate": 2.8529256343431867e-06, "loss": 0.70879185, "num_input_tokens_seen": 135294140, "step": 6303, "time_per_iteration": 2.8387813568115234 }, { "auxiliary_loss_clip": 0.01133092, "auxiliary_loss_mlp": 0.01039166, "balance_loss_clip": 1.04745388, "balance_loss_mlp": 1.02412772, "epoch": 0.3790169848188787, "flos": 23585194990080.0, "grad_norm": 1.8875159078783896, "language_loss": 0.77695227, "learning_rate": 2.8525733480989846e-06, "loss": 0.79867482, "num_input_tokens_seen": 135314845, "step": 6304, "time_per_iteration": 2.673499584197998 }, { "auxiliary_loss_clip": 0.01145067, "auxiliary_loss_mlp": 0.01040581, "balance_loss_clip": 1.05417812, "balance_loss_mlp": 1.02412987, "epoch": 0.37907710807154665, "flos": 18437436230400.0, "grad_norm": 2.779181085633227, "language_loss": 0.79659361, "learning_rate": 2.8522210295251146e-06, "loss": 0.81845009, "num_input_tokens_seen": 135333055, "step": 6305, "time_per_iteration": 2.5770838260650635 }, { "auxiliary_loss_clip": 0.01046795, "auxiliary_loss_mlp": 0.01001141, "balance_loss_clip": 1.02554131, "balance_loss_mlp": 0.99954396, "epoch": 0.3791372313242146, "flos": 50107165954560.0, "grad_norm": 0.9814261912828969, "language_loss": 0.64473259, "learning_rate": 2.8518686786349387e-06, "loss": 0.66521198, "num_input_tokens_seen": 135387865, "step": 6306, "time_per_iteration": 3.0782721042633057 }, { "auxiliary_loss_clip": 0.01111605, "auxiliary_loss_mlp": 0.01058558, "balance_loss_clip": 1.04987538, "balance_loss_mlp": 1.03932941, "epoch": 0.3791973545768826, "flos": 24316551809280.0, "grad_norm": 3.4757923579383343, "language_loss": 0.73271245, "learning_rate": 2.851516295441817e-06, "loss": 0.75441408, "num_input_tokens_seen": 135409095, "step": 6307, "time_per_iteration": 2.756335973739624 }, { "auxiliary_loss_clip": 0.01112868, "auxiliary_loss_mlp": 0.01041837, "balance_loss_clip": 1.04757965, "balance_loss_mlp": 1.02545738, "epoch": 0.3792574778295506, "flos": 21579907438080.0, "grad_norm": 1.5984922637838355, "language_loss": 0.78426826, "learning_rate": 2.851163879959112e-06, "loss": 0.80581522, "num_input_tokens_seen": 135429585, "step": 6308, "time_per_iteration": 2.7782399654388428 }, { "auxiliary_loss_clip": 0.01099815, "auxiliary_loss_mlp": 0.01047567, "balance_loss_clip": 1.04646075, "balance_loss_mlp": 1.03061557, "epoch": 0.37931760108221857, "flos": 22272731942400.0, "grad_norm": 30.20771720098995, "language_loss": 0.72349942, "learning_rate": 2.8508114322001876e-06, "loss": 0.74497324, "num_input_tokens_seen": 135446320, "step": 6309, "time_per_iteration": 2.779332399368286 }, { "auxiliary_loss_clip": 0.0107726, "auxiliary_loss_mlp": 0.01047463, "balance_loss_clip": 1.04217935, "balance_loss_mlp": 1.03061867, "epoch": 0.37937772433488653, "flos": 19682998197120.0, "grad_norm": 1.3823910789919382, "language_loss": 0.78832853, "learning_rate": 2.8504589521784083e-06, "loss": 0.8095758, "num_input_tokens_seen": 135465720, "step": 6310, "time_per_iteration": 2.771423101425171 }, { "auxiliary_loss_clip": 0.01125039, "auxiliary_loss_mlp": 0.0077385, "balance_loss_clip": 1.04667282, "balance_loss_mlp": 1.00038886, "epoch": 0.3794378475875545, "flos": 19099378016640.0, "grad_norm": 2.0276391959107687, "language_loss": 0.76350379, "learning_rate": 2.8501064399071403e-06, "loss": 0.78249264, "num_input_tokens_seen": 135485155, "step": 6311, "time_per_iteration": 2.6458020210266113 }, { "auxiliary_loss_clip": 0.01111162, "auxiliary_loss_mlp": 0.01038798, "balance_loss_clip": 1.04782593, "balance_loss_mlp": 1.02345526, "epoch": 0.37949797084022246, "flos": 20339660684160.0, "grad_norm": 1.662830094695082, "language_loss": 0.7082535, "learning_rate": 2.8497538953997504e-06, "loss": 0.72975308, "num_input_tokens_seen": 135502675, "step": 6312, "time_per_iteration": 2.719555377960205 }, { "auxiliary_loss_clip": 0.01023104, "auxiliary_loss_mlp": 0.01013837, "balance_loss_clip": 1.02154779, "balance_loss_mlp": 1.0123291, "epoch": 0.37955809409289043, "flos": 63972203477760.0, "grad_norm": 0.7865225154891, "language_loss": 0.56087357, "learning_rate": 2.849401318669608e-06, "loss": 0.58124298, "num_input_tokens_seen": 135562005, "step": 6313, "time_per_iteration": 3.2287843227386475 }, { "auxiliary_loss_clip": 0.01096229, "auxiliary_loss_mlp": 0.01051812, "balance_loss_clip": 1.04299724, "balance_loss_mlp": 1.03592694, "epoch": 0.3796182173455584, "flos": 31540665179520.0, "grad_norm": 1.6673731637282567, "language_loss": 0.71260917, "learning_rate": 2.849048709730083e-06, "loss": 0.73408955, "num_input_tokens_seen": 135582600, "step": 6314, "time_per_iteration": 2.7842931747436523 }, { "auxiliary_loss_clip": 0.01129376, "auxiliary_loss_mlp": 0.01048605, "balance_loss_clip": 1.04880047, "balance_loss_mlp": 1.03201127, "epoch": 0.37967834059822636, "flos": 12130804978560.0, "grad_norm": 2.0299747539506408, "language_loss": 0.73270208, "learning_rate": 2.848696068594545e-06, "loss": 0.75448191, "num_input_tokens_seen": 135600280, "step": 6315, "time_per_iteration": 2.6785545349121094 }, { "auxiliary_loss_clip": 0.01122054, "auxiliary_loss_mlp": 0.01048691, "balance_loss_clip": 1.0479691, "balance_loss_mlp": 1.03326535, "epoch": 0.3797384638508943, "flos": 39348578298240.0, "grad_norm": 2.0273248392275645, "language_loss": 0.71108794, "learning_rate": 2.8483433952763677e-06, "loss": 0.73279542, "num_input_tokens_seen": 135621560, "step": 6316, "time_per_iteration": 2.7634074687957764 }, { "auxiliary_loss_clip": 0.01099766, "auxiliary_loss_mlp": 0.01041876, "balance_loss_clip": 1.04686475, "balance_loss_mlp": 1.02733219, "epoch": 0.3797985871035623, "flos": 34054016653440.0, "grad_norm": 6.091183487708486, "language_loss": 0.6551193, "learning_rate": 2.847990689788923e-06, "loss": 0.67653567, "num_input_tokens_seen": 135641745, "step": 6317, "time_per_iteration": 2.8334715366363525 }, { "auxiliary_loss_clip": 0.01119227, "auxiliary_loss_mlp": 0.01036315, "balance_loss_clip": 1.04556906, "balance_loss_mlp": 1.02204525, "epoch": 0.37985871035623026, "flos": 23222174186880.0, "grad_norm": 2.5588148844770364, "language_loss": 0.85254991, "learning_rate": 2.8476379521455877e-06, "loss": 0.87410533, "num_input_tokens_seen": 135660650, "step": 6318, "time_per_iteration": 2.6611499786376953 }, { "auxiliary_loss_clip": 0.01113843, "auxiliary_loss_mlp": 0.01046062, "balance_loss_clip": 1.04669976, "balance_loss_mlp": 1.02933645, "epoch": 0.3799188336088982, "flos": 18114958903680.0, "grad_norm": 2.5013130494780254, "language_loss": 0.75813186, "learning_rate": 2.8472851823597354e-06, "loss": 0.77973092, "num_input_tokens_seen": 135679980, "step": 6319, "time_per_iteration": 2.643206834793091 }, { "auxiliary_loss_clip": 0.01136645, "auxiliary_loss_mlp": 0.01043703, "balance_loss_clip": 1.04961717, "balance_loss_mlp": 1.02813435, "epoch": 0.3799789568615662, "flos": 21871897096320.0, "grad_norm": 1.6614251909537696, "language_loss": 0.64298296, "learning_rate": 2.846932380444744e-06, "loss": 0.66478646, "num_input_tokens_seen": 135699400, "step": 6320, "time_per_iteration": 4.031519174575806 }, { "auxiliary_loss_clip": 0.01102323, "auxiliary_loss_mlp": 0.01046665, "balance_loss_clip": 1.05175698, "balance_loss_mlp": 1.03132319, "epoch": 0.3800390801142342, "flos": 32962943082240.0, "grad_norm": 2.289587921641626, "language_loss": 0.713642, "learning_rate": 2.846579546413992e-06, "loss": 0.73513186, "num_input_tokens_seen": 135723455, "step": 6321, "time_per_iteration": 2.8465514183044434 }, { "auxiliary_loss_clip": 0.01096183, "auxiliary_loss_mlp": 0.01042053, "balance_loss_clip": 1.04067016, "balance_loss_mlp": 1.02673435, "epoch": 0.38009920336690217, "flos": 26907075653760.0, "grad_norm": 1.7413772853733611, "language_loss": 0.74461544, "learning_rate": 2.846226680280859e-06, "loss": 0.76599777, "num_input_tokens_seen": 135744335, "step": 6322, "time_per_iteration": 4.407487630844116 }, { "auxiliary_loss_clip": 0.01122719, "auxiliary_loss_mlp": 0.01040835, "balance_loss_clip": 1.0462966, "balance_loss_mlp": 1.02587986, "epoch": 0.38015932661957014, "flos": 22488913946880.0, "grad_norm": 3.5770930684707527, "language_loss": 0.84908414, "learning_rate": 2.845873782058725e-06, "loss": 0.87071967, "num_input_tokens_seen": 135761440, "step": 6323, "time_per_iteration": 2.6349892616271973 }, { "auxiliary_loss_clip": 0.01111414, "auxiliary_loss_mlp": 0.01037556, "balance_loss_clip": 1.04454303, "balance_loss_mlp": 1.02075982, "epoch": 0.3802194498722381, "flos": 21980993679360.0, "grad_norm": 5.3693824839272954, "language_loss": 0.73171353, "learning_rate": 2.845520851760973e-06, "loss": 0.75320327, "num_input_tokens_seen": 135779955, "step": 6324, "time_per_iteration": 4.240839958190918 }, { "auxiliary_loss_clip": 0.01105568, "auxiliary_loss_mlp": 0.01038696, "balance_loss_clip": 1.04704404, "balance_loss_mlp": 1.02263856, "epoch": 0.38027957312490607, "flos": 21324869896320.0, "grad_norm": 1.716026134262254, "language_loss": 0.83859229, "learning_rate": 2.8451678894009847e-06, "loss": 0.86003488, "num_input_tokens_seen": 135799840, "step": 6325, "time_per_iteration": 2.72074818611145 }, { "auxiliary_loss_clip": 0.01110489, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 1.04811895, "balance_loss_mlp": 1.02094209, "epoch": 0.38033969637757403, "flos": 16691244456960.0, "grad_norm": 2.0321742163093264, "language_loss": 0.80093408, "learning_rate": 2.8448148949921465e-06, "loss": 0.82239556, "num_input_tokens_seen": 135817880, "step": 6326, "time_per_iteration": 4.313997030258179 }, { "auxiliary_loss_clip": 0.01119893, "auxiliary_loss_mlp": 0.01038876, "balance_loss_clip": 1.04593146, "balance_loss_mlp": 1.02497053, "epoch": 0.380399819630242, "flos": 36210847685760.0, "grad_norm": 1.80559395505396, "language_loss": 0.72578084, "learning_rate": 2.844461868547842e-06, "loss": 0.74736857, "num_input_tokens_seen": 135838940, "step": 6327, "time_per_iteration": 2.7500593662261963 }, { "auxiliary_loss_clip": 0.01134332, "auxiliary_loss_mlp": 0.00772576, "balance_loss_clip": 1.04898763, "balance_loss_mlp": 1.00039506, "epoch": 0.38045994288290996, "flos": 21288851533440.0, "grad_norm": 1.9791898832174752, "language_loss": 0.83074433, "learning_rate": 2.844108810081459e-06, "loss": 0.84981334, "num_input_tokens_seen": 135858325, "step": 6328, "time_per_iteration": 2.7503418922424316 }, { "auxiliary_loss_clip": 0.01119735, "auxiliary_loss_mlp": 0.01029986, "balance_loss_clip": 1.04522514, "balance_loss_mlp": 1.01522779, "epoch": 0.38052006613557793, "flos": 20922885815040.0, "grad_norm": 1.5313878449465446, "language_loss": 0.61713332, "learning_rate": 2.843755719606385e-06, "loss": 0.63863051, "num_input_tokens_seen": 135878430, "step": 6329, "time_per_iteration": 2.682016134262085 }, { "auxiliary_loss_clip": 0.01103557, "auxiliary_loss_mlp": 0.01040275, "balance_loss_clip": 1.04332185, "balance_loss_mlp": 1.02436066, "epoch": 0.3805801893882459, "flos": 20990720649600.0, "grad_norm": 1.9096594726999414, "language_loss": 0.56007183, "learning_rate": 2.8434025971360104e-06, "loss": 0.58151013, "num_input_tokens_seen": 135894755, "step": 6330, "time_per_iteration": 2.6704044342041016 }, { "auxiliary_loss_clip": 0.01088801, "auxiliary_loss_mlp": 0.01035148, "balance_loss_clip": 1.04801345, "balance_loss_mlp": 1.02142704, "epoch": 0.38064031264091386, "flos": 25558594243200.0, "grad_norm": 3.9882905607247046, "language_loss": 0.65945244, "learning_rate": 2.8430494426837243e-06, "loss": 0.6806919, "num_input_tokens_seen": 135918275, "step": 6331, "time_per_iteration": 2.750293731689453 }, { "auxiliary_loss_clip": 0.01120934, "auxiliary_loss_mlp": 0.01042908, "balance_loss_clip": 1.05122471, "balance_loss_mlp": 1.02723169, "epoch": 0.3807004358935818, "flos": 15085857997440.0, "grad_norm": 2.769340057272882, "language_loss": 0.7601527, "learning_rate": 2.842696256262919e-06, "loss": 0.78179109, "num_input_tokens_seen": 135937430, "step": 6332, "time_per_iteration": 2.64774227142334 }, { "auxiliary_loss_clip": 0.01073508, "auxiliary_loss_mlp": 0.00772959, "balance_loss_clip": 1.04594767, "balance_loss_mlp": 1.00029111, "epoch": 0.3807605591462498, "flos": 16399398453120.0, "grad_norm": 2.059894273755589, "language_loss": 0.8224051, "learning_rate": 2.842343037886987e-06, "loss": 0.84086972, "num_input_tokens_seen": 135954210, "step": 6333, "time_per_iteration": 2.7650275230407715 }, { "auxiliary_loss_clip": 0.01121534, "auxiliary_loss_mlp": 0.01033205, "balance_loss_clip": 1.04730785, "balance_loss_mlp": 1.01878643, "epoch": 0.3808206823989178, "flos": 29057083102080.0, "grad_norm": 1.5368445040683132, "language_loss": 0.8620519, "learning_rate": 2.8419897875693226e-06, "loss": 0.88359934, "num_input_tokens_seen": 135974425, "step": 6334, "time_per_iteration": 2.7348363399505615 }, { "auxiliary_loss_clip": 0.01123412, "auxiliary_loss_mlp": 0.01038067, "balance_loss_clip": 1.04626036, "balance_loss_mlp": 1.02280819, "epoch": 0.3808808056515858, "flos": 15705855676800.0, "grad_norm": 1.7714454860846107, "language_loss": 0.79359698, "learning_rate": 2.841636505323321e-06, "loss": 0.81521177, "num_input_tokens_seen": 135991985, "step": 6335, "time_per_iteration": 2.7020695209503174 }, { "auxiliary_loss_clip": 0.01121693, "auxiliary_loss_mlp": 0.01033758, "balance_loss_clip": 1.04490542, "balance_loss_mlp": 1.01847494, "epoch": 0.38094092890425374, "flos": 20704584908160.0, "grad_norm": 1.872444579903983, "language_loss": 0.72939491, "learning_rate": 2.8412831911623795e-06, "loss": 0.75094938, "num_input_tokens_seen": 136010015, "step": 6336, "time_per_iteration": 2.7088463306427 }, { "auxiliary_loss_clip": 0.01117324, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.04605365, "balance_loss_mlp": 1.01930285, "epoch": 0.3810010521569217, "flos": 20667956014080.0, "grad_norm": 2.014308937626889, "language_loss": 0.69164217, "learning_rate": 2.840929845099894e-06, "loss": 0.71314949, "num_input_tokens_seen": 136028440, "step": 6337, "time_per_iteration": 2.6832611560821533 }, { "auxiliary_loss_clip": 0.01111033, "auxiliary_loss_mlp": 0.01036513, "balance_loss_clip": 1.04483473, "balance_loss_mlp": 1.02133763, "epoch": 0.38106117540958967, "flos": 31827626933760.0, "grad_norm": 1.9800177042646252, "language_loss": 0.63416338, "learning_rate": 2.8405764671492652e-06, "loss": 0.65563887, "num_input_tokens_seen": 136048360, "step": 6338, "time_per_iteration": 2.8045074939727783 }, { "auxiliary_loss_clip": 0.01112594, "auxiliary_loss_mlp": 0.01041591, "balance_loss_clip": 1.04514265, "balance_loss_mlp": 1.02520001, "epoch": 0.38112129866225763, "flos": 16902757693440.0, "grad_norm": 2.42049576026076, "language_loss": 0.69146717, "learning_rate": 2.8402230573238923e-06, "loss": 0.713009, "num_input_tokens_seen": 136065500, "step": 6339, "time_per_iteration": 2.6873764991760254 }, { "auxiliary_loss_clip": 0.01107753, "auxiliary_loss_mlp": 0.01047128, "balance_loss_clip": 1.04493856, "balance_loss_mlp": 1.03165436, "epoch": 0.3811814219149256, "flos": 20887226588160.0, "grad_norm": 2.484915003961603, "language_loss": 0.68283296, "learning_rate": 2.839869615637177e-06, "loss": 0.70438182, "num_input_tokens_seen": 136084060, "step": 6340, "time_per_iteration": 2.730966567993164 }, { "auxiliary_loss_clip": 0.01098909, "auxiliary_loss_mlp": 0.01040765, "balance_loss_clip": 1.0444243, "balance_loss_mlp": 1.02449322, "epoch": 0.38124154516759357, "flos": 16690813493760.0, "grad_norm": 2.645956512625022, "language_loss": 0.89689833, "learning_rate": 2.839516142102522e-06, "loss": 0.91829509, "num_input_tokens_seen": 136102310, "step": 6341, "time_per_iteration": 2.7552878856658936 }, { "auxiliary_loss_clip": 0.01127861, "auxiliary_loss_mlp": 0.01042909, "balance_loss_clip": 1.04863834, "balance_loss_mlp": 1.02668464, "epoch": 0.38130166842026153, "flos": 19681956702720.0, "grad_norm": 2.1539523414578103, "language_loss": 0.75359344, "learning_rate": 2.83916263673333e-06, "loss": 0.7753011, "num_input_tokens_seen": 136120725, "step": 6342, "time_per_iteration": 2.6937670707702637 }, { "auxiliary_loss_clip": 0.01109868, "auxiliary_loss_mlp": 0.01035797, "balance_loss_clip": 1.04506934, "balance_loss_mlp": 1.02071738, "epoch": 0.3813617916729295, "flos": 22198432659840.0, "grad_norm": 1.797512240627555, "language_loss": 0.8348105, "learning_rate": 2.838809099543007e-06, "loss": 0.85626709, "num_input_tokens_seen": 136139105, "step": 6343, "time_per_iteration": 2.6647467613220215 }, { "auxiliary_loss_clip": 0.01073856, "auxiliary_loss_mlp": 0.01047466, "balance_loss_clip": 1.04339314, "balance_loss_mlp": 1.03099144, "epoch": 0.38142191492559746, "flos": 19096899978240.0, "grad_norm": 1.8507846773973766, "language_loss": 0.76930642, "learning_rate": 2.838455530544959e-06, "loss": 0.7905196, "num_input_tokens_seen": 136158265, "step": 6344, "time_per_iteration": 2.807464838027954 }, { "auxiliary_loss_clip": 0.01099031, "auxiliary_loss_mlp": 0.01049913, "balance_loss_clip": 1.04580665, "balance_loss_mlp": 1.03225255, "epoch": 0.3814820381782654, "flos": 24097748112000.0, "grad_norm": 2.0591822661314847, "language_loss": 0.73010087, "learning_rate": 2.838101929752593e-06, "loss": 0.75159037, "num_input_tokens_seen": 136176100, "step": 6345, "time_per_iteration": 2.756462574005127 }, { "auxiliary_loss_clip": 0.01094565, "auxiliary_loss_mlp": 0.00771987, "balance_loss_clip": 1.04568338, "balance_loss_mlp": 1.00028944, "epoch": 0.3815421614309334, "flos": 15778502933760.0, "grad_norm": 1.8320535118847152, "language_loss": 0.69709373, "learning_rate": 2.8377482971793187e-06, "loss": 0.71575922, "num_input_tokens_seen": 136195125, "step": 6346, "time_per_iteration": 2.7221782207489014 }, { "auxiliary_loss_clip": 0.01124746, "auxiliary_loss_mlp": 0.01038046, "balance_loss_clip": 1.04819, "balance_loss_mlp": 1.02297819, "epoch": 0.38160228468360136, "flos": 19899754819200.0, "grad_norm": 1.9952986193352877, "language_loss": 0.75480664, "learning_rate": 2.8373946328385437e-06, "loss": 0.77643454, "num_input_tokens_seen": 136213885, "step": 6347, "time_per_iteration": 2.646730422973633 }, { "auxiliary_loss_clip": 0.0112204, "auxiliary_loss_mlp": 0.01039786, "balance_loss_clip": 1.04638994, "balance_loss_mlp": 1.0253861, "epoch": 0.3816624079362694, "flos": 19281050029440.0, "grad_norm": 3.670871038619067, "language_loss": 0.74398822, "learning_rate": 2.8370409367436813e-06, "loss": 0.76560652, "num_input_tokens_seen": 136232700, "step": 6348, "time_per_iteration": 2.651153802871704 }, { "auxiliary_loss_clip": 0.01109969, "auxiliary_loss_mlp": 0.01037685, "balance_loss_clip": 1.04792547, "balance_loss_mlp": 1.0233444, "epoch": 0.38172253118893734, "flos": 21177564220800.0, "grad_norm": 2.7978232906816665, "language_loss": 0.87172502, "learning_rate": 2.836687208908142e-06, "loss": 0.89320159, "num_input_tokens_seen": 136248975, "step": 6349, "time_per_iteration": 2.693459987640381 }, { "auxiliary_loss_clip": 0.0112098, "auxiliary_loss_mlp": 0.01037146, "balance_loss_clip": 1.04788637, "balance_loss_mlp": 1.02244771, "epoch": 0.3817826544416053, "flos": 17529219820800.0, "grad_norm": 1.7341599494512197, "language_loss": 0.76554048, "learning_rate": 2.836333449345341e-06, "loss": 0.78712171, "num_input_tokens_seen": 136266710, "step": 6350, "time_per_iteration": 2.6194076538085938 }, { "auxiliary_loss_clip": 0.01104228, "auxiliary_loss_mlp": 0.01032221, "balance_loss_clip": 1.04922175, "balance_loss_mlp": 1.01640153, "epoch": 0.38184277769427327, "flos": 16326535714560.0, "grad_norm": 2.525722230514251, "language_loss": 0.75608248, "learning_rate": 2.8359796580686907e-06, "loss": 0.77744693, "num_input_tokens_seen": 136284445, "step": 6351, "time_per_iteration": 2.723487138748169 }, { "auxiliary_loss_clip": 0.01122109, "auxiliary_loss_mlp": 0.01037028, "balance_loss_clip": 1.04607773, "balance_loss_mlp": 1.02048135, "epoch": 0.38190290094694124, "flos": 30443450382720.0, "grad_norm": 2.201358799690427, "language_loss": 0.74001205, "learning_rate": 2.8356258350916085e-06, "loss": 0.76160336, "num_input_tokens_seen": 136305730, "step": 6352, "time_per_iteration": 2.6779909133911133 }, { "auxiliary_loss_clip": 0.01093469, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.04185915, "balance_loss_mlp": 1.02093625, "epoch": 0.3819630241996092, "flos": 14209924936320.0, "grad_norm": 1.7014377772216425, "language_loss": 0.64249897, "learning_rate": 2.8352719804275104e-06, "loss": 0.66378438, "num_input_tokens_seen": 136323850, "step": 6353, "time_per_iteration": 2.731860399246216 }, { "auxiliary_loss_clip": 0.01133265, "auxiliary_loss_mlp": 0.01039549, "balance_loss_clip": 1.04809213, "balance_loss_mlp": 1.02529204, "epoch": 0.38202314745227717, "flos": 25009699536000.0, "grad_norm": 2.7523604394748644, "language_loss": 0.83447051, "learning_rate": 2.834918094089816e-06, "loss": 0.85619861, "num_input_tokens_seen": 136344880, "step": 6354, "time_per_iteration": 2.665891170501709 }, { "auxiliary_loss_clip": 0.01132291, "auxiliary_loss_mlp": 0.01034862, "balance_loss_clip": 1.04866302, "balance_loss_mlp": 1.02162409, "epoch": 0.38208327070494513, "flos": 20814507504000.0, "grad_norm": 16.091226432139102, "language_loss": 0.80633152, "learning_rate": 2.834564176091943e-06, "loss": 0.82800299, "num_input_tokens_seen": 136366060, "step": 6355, "time_per_iteration": 2.6580965518951416 }, { "auxiliary_loss_clip": 0.01092469, "auxiliary_loss_mlp": 0.01037027, "balance_loss_clip": 1.04551625, "balance_loss_mlp": 1.02263832, "epoch": 0.3821433939576131, "flos": 22637727993600.0, "grad_norm": 1.8508447811900344, "language_loss": 0.75970227, "learning_rate": 2.8342102264473125e-06, "loss": 0.78099722, "num_input_tokens_seen": 136385625, "step": 6356, "time_per_iteration": 2.7381057739257812 }, { "auxiliary_loss_clip": 0.01123851, "auxiliary_loss_mlp": 0.00772749, "balance_loss_clip": 1.04802036, "balance_loss_mlp": 1.00034022, "epoch": 0.38220351721028106, "flos": 26869872142080.0, "grad_norm": 2.3854964939919188, "language_loss": 0.81208009, "learning_rate": 2.833856245169348e-06, "loss": 0.8310461, "num_input_tokens_seen": 136405750, "step": 6357, "time_per_iteration": 2.8209376335144043 }, { "auxiliary_loss_clip": 0.01118527, "auxiliary_loss_mlp": 0.01044748, "balance_loss_clip": 1.05246222, "balance_loss_mlp": 1.02842796, "epoch": 0.38226364046294903, "flos": 23367468700800.0, "grad_norm": 2.215929075758269, "language_loss": 0.77378345, "learning_rate": 2.8335022322714695e-06, "loss": 0.79541618, "num_input_tokens_seen": 136426085, "step": 6358, "time_per_iteration": 2.7004640102386475 }, { "auxiliary_loss_clip": 0.01115504, "auxiliary_loss_mlp": 0.01047061, "balance_loss_clip": 1.0469476, "balance_loss_mlp": 1.03118849, "epoch": 0.382323763715617, "flos": 19646225648640.0, "grad_norm": 3.6635579737055837, "language_loss": 0.78477705, "learning_rate": 2.8331481877671036e-06, "loss": 0.80640268, "num_input_tokens_seen": 136442670, "step": 6359, "time_per_iteration": 4.184551954269409 }, { "auxiliary_loss_clip": 0.01065181, "auxiliary_loss_mlp": 0.01052018, "balance_loss_clip": 1.03820515, "balance_loss_mlp": 1.03462481, "epoch": 0.38238388696828496, "flos": 54124741232640.0, "grad_norm": 1.6779400536158158, "language_loss": 0.69735414, "learning_rate": 2.8327941116696754e-06, "loss": 0.71852612, "num_input_tokens_seen": 136465730, "step": 6360, "time_per_iteration": 3.1072845458984375 }, { "auxiliary_loss_clip": 0.01102455, "auxiliary_loss_mlp": 0.01037366, "balance_loss_clip": 1.04502857, "balance_loss_mlp": 1.02189279, "epoch": 0.382444010220953, "flos": 24936190352640.0, "grad_norm": 1.5790785802582266, "language_loss": 0.79362941, "learning_rate": 2.83244000399261e-06, "loss": 0.81502759, "num_input_tokens_seen": 136487215, "step": 6361, "time_per_iteration": 4.285314559936523 }, { "auxiliary_loss_clip": 0.01111113, "auxiliary_loss_mlp": 0.01043827, "balance_loss_clip": 1.04649949, "balance_loss_mlp": 1.02906859, "epoch": 0.38250413347362094, "flos": 42337351209600.0, "grad_norm": 1.9067122847602551, "language_loss": 0.65606177, "learning_rate": 2.832085864749337e-06, "loss": 0.67761117, "num_input_tokens_seen": 136510365, "step": 6362, "time_per_iteration": 2.8447117805480957 }, { "auxiliary_loss_clip": 0.0113439, "auxiliary_loss_mlp": 0.01035947, "balance_loss_clip": 1.0483737, "balance_loss_mlp": 1.01978207, "epoch": 0.3825642567262889, "flos": 16289224462080.0, "grad_norm": 2.3383155012254284, "language_loss": 0.82138497, "learning_rate": 2.8317316939532848e-06, "loss": 0.84308833, "num_input_tokens_seen": 136527100, "step": 6363, "time_per_iteration": 4.166736602783203 }, { "auxiliary_loss_clip": 0.01075728, "auxiliary_loss_mlp": 0.01042552, "balance_loss_clip": 1.04349709, "balance_loss_mlp": 1.02707291, "epoch": 0.3826243799789569, "flos": 45654778586880.0, "grad_norm": 2.1311203141010835, "language_loss": 0.59044886, "learning_rate": 2.8313774916178825e-06, "loss": 0.61163169, "num_input_tokens_seen": 136550870, "step": 6364, "time_per_iteration": 3.006801128387451 }, { "auxiliary_loss_clip": 0.01122076, "auxiliary_loss_mlp": 0.01041213, "balance_loss_clip": 1.05097353, "balance_loss_mlp": 1.02542353, "epoch": 0.38268450323162484, "flos": 25301581453440.0, "grad_norm": 1.9239689491626994, "language_loss": 0.68903065, "learning_rate": 2.8310232577565635e-06, "loss": 0.7106635, "num_input_tokens_seen": 136569895, "step": 6365, "time_per_iteration": 2.695068597793579 }, { "auxiliary_loss_clip": 0.01123716, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.04955769, "balance_loss_mlp": 1.02366817, "epoch": 0.3827446264842928, "flos": 21836022387840.0, "grad_norm": 2.0334034116186137, "language_loss": 0.73193848, "learning_rate": 2.830668992382758e-06, "loss": 0.75357372, "num_input_tokens_seen": 136588585, "step": 6366, "time_per_iteration": 4.418980598449707 }, { "auxiliary_loss_clip": 0.01115964, "auxiliary_loss_mlp": 0.01038347, "balance_loss_clip": 1.04846239, "balance_loss_mlp": 1.02265882, "epoch": 0.38280474973696077, "flos": 25734591907200.0, "grad_norm": 2.4539991484931645, "language_loss": 0.68623614, "learning_rate": 2.830314695509902e-06, "loss": 0.70777929, "num_input_tokens_seen": 136606640, "step": 6367, "time_per_iteration": 2.6878082752227783 }, { "auxiliary_loss_clip": 0.01125961, "auxiliary_loss_mlp": 0.01037618, "balance_loss_clip": 1.05120409, "balance_loss_mlp": 1.02256823, "epoch": 0.38286487298962874, "flos": 24895934184960.0, "grad_norm": 2.196344444241347, "language_loss": 0.64423102, "learning_rate": 2.82996036715143e-06, "loss": 0.66586685, "num_input_tokens_seen": 136624940, "step": 6368, "time_per_iteration": 2.6698646545410156 }, { "auxiliary_loss_clip": 0.01139795, "auxiliary_loss_mlp": 0.01040116, "balance_loss_clip": 1.05269098, "balance_loss_mlp": 1.02390361, "epoch": 0.3829249962422967, "flos": 28543703967360.0, "grad_norm": 1.346024597035963, "language_loss": 0.684017, "learning_rate": 2.8296060073207763e-06, "loss": 0.70581615, "num_input_tokens_seen": 136645540, "step": 6369, "time_per_iteration": 2.7156169414520264 }, { "auxiliary_loss_clip": 0.01084469, "auxiliary_loss_mlp": 0.01039929, "balance_loss_clip": 1.04267466, "balance_loss_mlp": 1.02391946, "epoch": 0.38298511949496467, "flos": 21471205904640.0, "grad_norm": 1.7824237306329542, "language_loss": 0.78701794, "learning_rate": 2.8292516160313804e-06, "loss": 0.80826187, "num_input_tokens_seen": 136664530, "step": 6370, "time_per_iteration": 2.7351901531219482 }, { "auxiliary_loss_clip": 0.01121027, "auxiliary_loss_mlp": 0.01050163, "balance_loss_clip": 1.04909503, "balance_loss_mlp": 1.03279376, "epoch": 0.38304524274763263, "flos": 31679998035840.0, "grad_norm": 2.5095706519371794, "language_loss": 0.65098304, "learning_rate": 2.8288971932966805e-06, "loss": 0.67269492, "num_input_tokens_seen": 136682315, "step": 6371, "time_per_iteration": 2.739689350128174 }, { "auxiliary_loss_clip": 0.01110581, "auxiliary_loss_mlp": 0.01041968, "balance_loss_clip": 1.04938042, "balance_loss_mlp": 1.02471852, "epoch": 0.3831053660003006, "flos": 25076816098560.0, "grad_norm": 3.269308088463154, "language_loss": 0.7304002, "learning_rate": 2.8285427391301155e-06, "loss": 0.75192571, "num_input_tokens_seen": 136701185, "step": 6372, "time_per_iteration": 2.7497966289520264 }, { "auxiliary_loss_clip": 0.01127864, "auxiliary_loss_mlp": 0.01034223, "balance_loss_clip": 1.05050421, "balance_loss_mlp": 1.01848698, "epoch": 0.38316548925296856, "flos": 23259018562560.0, "grad_norm": 1.83316702621751, "language_loss": 0.8491025, "learning_rate": 2.8281882535451266e-06, "loss": 0.87072337, "num_input_tokens_seen": 136721265, "step": 6373, "time_per_iteration": 2.6510777473449707 }, { "auxiliary_loss_clip": 0.01084717, "auxiliary_loss_mlp": 0.01048262, "balance_loss_clip": 1.0416218, "balance_loss_mlp": 1.0316565, "epoch": 0.3832256125056366, "flos": 34423465991040.0, "grad_norm": 2.287485479433922, "language_loss": 0.74893212, "learning_rate": 2.8278337365551567e-06, "loss": 0.770262, "num_input_tokens_seen": 136741885, "step": 6374, "time_per_iteration": 2.8658056259155273 }, { "auxiliary_loss_clip": 0.01130215, "auxiliary_loss_mlp": 0.01042427, "balance_loss_clip": 1.05264366, "balance_loss_mlp": 1.02613068, "epoch": 0.38328573575830455, "flos": 21762764599680.0, "grad_norm": 7.5426595342284735, "language_loss": 0.75737238, "learning_rate": 2.8274791881736485e-06, "loss": 0.77909875, "num_input_tokens_seen": 136760905, "step": 6375, "time_per_iteration": 2.6622958183288574 }, { "auxiliary_loss_clip": 0.01126708, "auxiliary_loss_mlp": 0.01039776, "balance_loss_clip": 1.05043924, "balance_loss_mlp": 1.0244453, "epoch": 0.3833458590109725, "flos": 17380010724480.0, "grad_norm": 2.1246389624552435, "language_loss": 0.72777182, "learning_rate": 2.8271246084140457e-06, "loss": 0.74943662, "num_input_tokens_seen": 136777240, "step": 6376, "time_per_iteration": 2.6562421321868896 }, { "auxiliary_loss_clip": 0.01122147, "auxiliary_loss_mlp": 0.01039822, "balance_loss_clip": 1.04791379, "balance_loss_mlp": 1.02381194, "epoch": 0.3834059822636405, "flos": 29424557191680.0, "grad_norm": 1.7414598633373413, "language_loss": 0.67441249, "learning_rate": 2.826769997289796e-06, "loss": 0.69603217, "num_input_tokens_seen": 136801040, "step": 6377, "time_per_iteration": 2.779766798019409 }, { "auxiliary_loss_clip": 0.01110002, "auxiliary_loss_mlp": 0.01041228, "balance_loss_clip": 1.05152845, "balance_loss_mlp": 1.02421689, "epoch": 0.38346610551630844, "flos": 21470739027840.0, "grad_norm": 2.377659826482013, "language_loss": 0.73287642, "learning_rate": 2.826415354814344e-06, "loss": 0.75438869, "num_input_tokens_seen": 136819495, "step": 6378, "time_per_iteration": 2.7345829010009766 }, { "auxiliary_loss_clip": 0.01085335, "auxiliary_loss_mlp": 0.01042694, "balance_loss_clip": 1.0479784, "balance_loss_mlp": 1.02707767, "epoch": 0.3835262287689764, "flos": 27561224188800.0, "grad_norm": 2.283576437082984, "language_loss": 0.69473612, "learning_rate": 2.8260606810011396e-06, "loss": 0.71601641, "num_input_tokens_seen": 136838840, "step": 6379, "time_per_iteration": 2.7592358589172363 }, { "auxiliary_loss_clip": 0.01124706, "auxiliary_loss_mlp": 0.01036177, "balance_loss_clip": 1.0516969, "balance_loss_mlp": 1.02094209, "epoch": 0.3835863520216444, "flos": 15523716787200.0, "grad_norm": 1.8393672130560537, "language_loss": 0.83356249, "learning_rate": 2.8257059758636315e-06, "loss": 0.85517132, "num_input_tokens_seen": 136854425, "step": 6380, "time_per_iteration": 2.6572370529174805 }, { "auxiliary_loss_clip": 0.01135434, "auxiliary_loss_mlp": 0.01035321, "balance_loss_clip": 1.05187774, "balance_loss_mlp": 1.02010989, "epoch": 0.38364647527431234, "flos": 21904934630400.0, "grad_norm": 1.5891747666862521, "language_loss": 0.8141042, "learning_rate": 2.8253512394152697e-06, "loss": 0.83581179, "num_input_tokens_seen": 136874355, "step": 6381, "time_per_iteration": 2.7251663208007812 }, { "auxiliary_loss_clip": 0.01057344, "auxiliary_loss_mlp": 0.01005901, "balance_loss_clip": 1.02759361, "balance_loss_mlp": 1.00418437, "epoch": 0.3837065985269803, "flos": 65534927558400.0, "grad_norm": 0.7954141143291842, "language_loss": 0.60376751, "learning_rate": 2.8249964716695068e-06, "loss": 0.62440002, "num_input_tokens_seen": 136937475, "step": 6382, "time_per_iteration": 3.1750948429107666 }, { "auxiliary_loss_clip": 0.01139607, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.05060625, "balance_loss_mlp": 1.02099442, "epoch": 0.38376672177964827, "flos": 28256598558720.0, "grad_norm": 3.8324285625149925, "language_loss": 0.66432369, "learning_rate": 2.824641672639794e-06, "loss": 0.68608773, "num_input_tokens_seen": 136955805, "step": 6383, "time_per_iteration": 2.7543957233428955 }, { "auxiliary_loss_clip": 0.01103794, "auxiliary_loss_mlp": 0.01039577, "balance_loss_clip": 1.04783142, "balance_loss_mlp": 1.02375221, "epoch": 0.38382684503231623, "flos": 20631363033600.0, "grad_norm": 2.110615575498957, "language_loss": 0.75144917, "learning_rate": 2.824286842339587e-06, "loss": 0.77288288, "num_input_tokens_seen": 136975240, "step": 6384, "time_per_iteration": 2.7796735763549805 }, { "auxiliary_loss_clip": 0.01122869, "auxiliary_loss_mlp": 0.01040365, "balance_loss_clip": 1.05156231, "balance_loss_mlp": 1.02510643, "epoch": 0.3838869682849842, "flos": 19605825826560.0, "grad_norm": 1.5394774946197278, "language_loss": 0.76096714, "learning_rate": 2.823931980782341e-06, "loss": 0.78259945, "num_input_tokens_seen": 136994985, "step": 6385, "time_per_iteration": 2.6831300258636475 }, { "auxiliary_loss_clip": 0.01046831, "auxiliary_loss_mlp": 0.01001133, "balance_loss_clip": 1.02648735, "balance_loss_mlp": 0.99943984, "epoch": 0.38394709153765216, "flos": 56556110891520.0, "grad_norm": 0.9063295744618779, "language_loss": 0.66955769, "learning_rate": 2.82357708798151e-06, "loss": 0.69003725, "num_input_tokens_seen": 137046290, "step": 6386, "time_per_iteration": 3.0693411827087402 }, { "auxiliary_loss_clip": 0.0109652, "auxiliary_loss_mlp": 0.01041859, "balance_loss_clip": 1.04551756, "balance_loss_mlp": 1.02686286, "epoch": 0.3840072147903202, "flos": 15888748752000.0, "grad_norm": 1.7986188221191803, "language_loss": 0.7215755, "learning_rate": 2.8232221639505547e-06, "loss": 0.74295932, "num_input_tokens_seen": 137064725, "step": 6387, "time_per_iteration": 2.736774206161499 }, { "auxiliary_loss_clip": 0.01134624, "auxiliary_loss_mlp": 0.01044946, "balance_loss_clip": 1.05156994, "balance_loss_mlp": 1.03039086, "epoch": 0.38406733804298815, "flos": 28218030330240.0, "grad_norm": 1.6374516085838389, "language_loss": 0.8088249, "learning_rate": 2.822867208702932e-06, "loss": 0.83062065, "num_input_tokens_seen": 137086030, "step": 6388, "time_per_iteration": 2.782958507537842 }, { "auxiliary_loss_clip": 0.01103471, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.04727554, "balance_loss_mlp": 1.03298843, "epoch": 0.3841274612956561, "flos": 18223588609920.0, "grad_norm": 1.7872750649564642, "language_loss": 0.76085746, "learning_rate": 2.8225122222521026e-06, "loss": 0.78236812, "num_input_tokens_seen": 137105400, "step": 6389, "time_per_iteration": 2.6644833087921143 }, { "auxiliary_loss_clip": 0.01119906, "auxiliary_loss_mlp": 0.0104877, "balance_loss_clip": 1.05389404, "balance_loss_mlp": 1.03203344, "epoch": 0.3841875845483241, "flos": 19792884879360.0, "grad_norm": 4.9507505317589775, "language_loss": 0.76550084, "learning_rate": 2.8221572046115273e-06, "loss": 0.78718758, "num_input_tokens_seen": 137124985, "step": 6390, "time_per_iteration": 2.825714588165283 }, { "auxiliary_loss_clip": 0.01090482, "auxiliary_loss_mlp": 0.01048203, "balance_loss_clip": 1.04517913, "balance_loss_mlp": 1.03196096, "epoch": 0.38424770780099204, "flos": 29898829393920.0, "grad_norm": 1.7614871223783444, "language_loss": 0.70377523, "learning_rate": 2.821802155794668e-06, "loss": 0.72516215, "num_input_tokens_seen": 137146745, "step": 6391, "time_per_iteration": 2.918065309524536 }, { "auxiliary_loss_clip": 0.01125443, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.04874265, "balance_loss_mlp": 1.02158153, "epoch": 0.38430783105366, "flos": 20813717404800.0, "grad_norm": 1.7948670510085722, "language_loss": 0.84005457, "learning_rate": 2.8214470758149884e-06, "loss": 0.86167878, "num_input_tokens_seen": 137163195, "step": 6392, "time_per_iteration": 2.679427146911621 }, { "auxiliary_loss_clip": 0.01122701, "auxiliary_loss_mlp": 0.01037128, "balance_loss_clip": 1.04846168, "balance_loss_mlp": 1.0227809, "epoch": 0.384367954306328, "flos": 10998577399680.0, "grad_norm": 2.3141685884805145, "language_loss": 0.6062203, "learning_rate": 2.8210919646859536e-06, "loss": 0.62781858, "num_input_tokens_seen": 137179330, "step": 6393, "time_per_iteration": 2.6622374057769775 }, { "auxiliary_loss_clip": 0.01110672, "auxiliary_loss_mlp": 0.01036894, "balance_loss_clip": 1.04954767, "balance_loss_mlp": 1.02025223, "epoch": 0.38442807755899594, "flos": 25338030779520.0, "grad_norm": 1.7908313499382054, "language_loss": 0.70639426, "learning_rate": 2.820736822421029e-06, "loss": 0.72786993, "num_input_tokens_seen": 137198655, "step": 6394, "time_per_iteration": 2.7460365295410156 }, { "auxiliary_loss_clip": 0.01123613, "auxiliary_loss_mlp": 0.01035163, "balance_loss_clip": 1.04763663, "balance_loss_mlp": 1.01871169, "epoch": 0.3844882008116639, "flos": 21069760527360.0, "grad_norm": 2.646318489707099, "language_loss": 0.81774974, "learning_rate": 2.8203816490336822e-06, "loss": 0.83933747, "num_input_tokens_seen": 137217120, "step": 6395, "time_per_iteration": 2.676023006439209 }, { "auxiliary_loss_clip": 0.01129196, "auxiliary_loss_mlp": 0.01046949, "balance_loss_clip": 1.05485177, "balance_loss_mlp": 1.03209007, "epoch": 0.38454832406433187, "flos": 17963235855360.0, "grad_norm": 1.9755185808990787, "language_loss": 0.71031433, "learning_rate": 2.8200264445373813e-06, "loss": 0.73207581, "num_input_tokens_seen": 137234410, "step": 6396, "time_per_iteration": 2.7082455158233643 }, { "auxiliary_loss_clip": 0.01044031, "auxiliary_loss_mlp": 0.0100801, "balance_loss_clip": 1.02689695, "balance_loss_mlp": 1.00657308, "epoch": 0.38460844731699984, "flos": 67924999555200.0, "grad_norm": 0.8839433118134116, "language_loss": 0.59671199, "learning_rate": 2.8196712089455954e-06, "loss": 0.61723238, "num_input_tokens_seen": 137294940, "step": 6397, "time_per_iteration": 3.2412428855895996 }, { "auxiliary_loss_clip": 0.01137376, "auxiliary_loss_mlp": 0.01035554, "balance_loss_clip": 1.05209756, "balance_loss_mlp": 1.02044976, "epoch": 0.3846685705696678, "flos": 25849075530240.0, "grad_norm": 2.648974669995796, "language_loss": 0.85017276, "learning_rate": 2.819315942271794e-06, "loss": 0.87190199, "num_input_tokens_seen": 137315035, "step": 6398, "time_per_iteration": 2.7374656200408936 }, { "auxiliary_loss_clip": 0.01136492, "auxiliary_loss_mlp": 0.01030698, "balance_loss_clip": 1.0517211, "balance_loss_mlp": 1.0165, "epoch": 0.38472869382233577, "flos": 16290194129280.0, "grad_norm": 2.1032431430060075, "language_loss": 0.79989493, "learning_rate": 2.8189606445294515e-06, "loss": 0.82156688, "num_input_tokens_seen": 137333155, "step": 6399, "time_per_iteration": 4.446218729019165 }, { "auxiliary_loss_clip": 0.0113807, "auxiliary_loss_mlp": 0.00773562, "balance_loss_clip": 1.05109119, "balance_loss_mlp": 1.00025833, "epoch": 0.38478881707500373, "flos": 19353122668800.0, "grad_norm": 3.0376300513317416, "language_loss": 0.67328328, "learning_rate": 2.818605315732038e-06, "loss": 0.69239962, "num_input_tokens_seen": 137351515, "step": 6400, "time_per_iteration": 2.6920905113220215 }, { "auxiliary_loss_clip": 0.01122811, "auxiliary_loss_mlp": 0.01042029, "balance_loss_clip": 1.05546772, "balance_loss_mlp": 1.0264008, "epoch": 0.38484894032767175, "flos": 24860849575680.0, "grad_norm": 11.158483612058907, "language_loss": 0.73623443, "learning_rate": 2.81824995589303e-06, "loss": 0.75788283, "num_input_tokens_seen": 137371255, "step": 6401, "time_per_iteration": 4.2371673583984375 }, { "auxiliary_loss_clip": 0.01102005, "auxiliary_loss_mlp": 0.01039851, "balance_loss_clip": 1.04852486, "balance_loss_mlp": 1.02387738, "epoch": 0.3849090635803397, "flos": 14501806853760.0, "grad_norm": 2.0006804524577233, "language_loss": 0.72059876, "learning_rate": 2.8178945650259012e-06, "loss": 0.74201727, "num_input_tokens_seen": 137388980, "step": 6402, "time_per_iteration": 2.686413288116455 }, { "auxiliary_loss_clip": 0.0113478, "auxiliary_loss_mlp": 0.01035082, "balance_loss_clip": 1.05094552, "balance_loss_mlp": 1.02016854, "epoch": 0.3849691868330077, "flos": 18515865576960.0, "grad_norm": 2.094788133183166, "language_loss": 0.82884681, "learning_rate": 2.817539143144128e-06, "loss": 0.85054541, "num_input_tokens_seen": 137406885, "step": 6403, "time_per_iteration": 4.234680891036987 }, { "auxiliary_loss_clip": 0.01078109, "auxiliary_loss_mlp": 0.01040581, "balance_loss_clip": 1.04205656, "balance_loss_mlp": 1.02466702, "epoch": 0.38502931008567565, "flos": 21616392677760.0, "grad_norm": 4.587008789601206, "language_loss": 0.82845348, "learning_rate": 2.817183690261189e-06, "loss": 0.84964037, "num_input_tokens_seen": 137425535, "step": 6404, "time_per_iteration": 2.777756452560425 }, { "auxiliary_loss_clip": 0.0111195, "auxiliary_loss_mlp": 0.01034861, "balance_loss_clip": 1.04970074, "balance_loss_mlp": 1.02046084, "epoch": 0.3850894333383436, "flos": 25415346804480.0, "grad_norm": 2.6287869212560646, "language_loss": 0.69417107, "learning_rate": 2.816828206390563e-06, "loss": 0.71563923, "num_input_tokens_seen": 137447700, "step": 6405, "time_per_iteration": 4.478301286697388 }, { "auxiliary_loss_clip": 0.01102381, "auxiliary_loss_mlp": 0.01038086, "balance_loss_clip": 1.0438571, "balance_loss_mlp": 1.02414417, "epoch": 0.3851495565910116, "flos": 20227870581120.0, "grad_norm": 1.9306681180439358, "language_loss": 0.79248095, "learning_rate": 2.816472691545729e-06, "loss": 0.81388557, "num_input_tokens_seen": 137462245, "step": 6406, "time_per_iteration": 2.7157816886901855 }, { "auxiliary_loss_clip": 0.01129296, "auxiliary_loss_mlp": 0.01040841, "balance_loss_clip": 1.05465746, "balance_loss_mlp": 1.02483082, "epoch": 0.38520967984367954, "flos": 16508459122560.0, "grad_norm": 5.929375109580111, "language_loss": 0.84107637, "learning_rate": 2.8161171457401694e-06, "loss": 0.86277771, "num_input_tokens_seen": 137476455, "step": 6407, "time_per_iteration": 2.6058037281036377 }, { "auxiliary_loss_clip": 0.01049614, "auxiliary_loss_mlp": 0.00999678, "balance_loss_clip": 1.03001904, "balance_loss_mlp": 0.99828893, "epoch": 0.3852698030963475, "flos": 61313772971520.0, "grad_norm": 0.845548946049954, "language_loss": 0.64919412, "learning_rate": 2.815761568987365e-06, "loss": 0.66968703, "num_input_tokens_seen": 137539845, "step": 6408, "time_per_iteration": 3.2015879154205322 }, { "auxiliary_loss_clip": 0.01110915, "auxiliary_loss_mlp": 0.01042045, "balance_loss_clip": 1.05201948, "balance_loss_mlp": 1.02547526, "epoch": 0.3853299263490155, "flos": 22893016930560.0, "grad_norm": 1.5734517214124462, "language_loss": 0.73444313, "learning_rate": 2.8154059613008e-06, "loss": 0.75597274, "num_input_tokens_seen": 137559880, "step": 6409, "time_per_iteration": 2.683310031890869 }, { "auxiliary_loss_clip": 0.01099042, "auxiliary_loss_mlp": 0.01052587, "balance_loss_clip": 1.05162942, "balance_loss_mlp": 1.03458679, "epoch": 0.38539004960168344, "flos": 20047491457920.0, "grad_norm": 3.095928763270071, "language_loss": 0.70505756, "learning_rate": 2.81505032269396e-06, "loss": 0.72657388, "num_input_tokens_seen": 137578225, "step": 6410, "time_per_iteration": 2.7694053649902344 }, { "auxiliary_loss_clip": 0.01018797, "auxiliary_loss_mlp": 0.00754046, "balance_loss_clip": 1.02754462, "balance_loss_mlp": 1.00070059, "epoch": 0.3854501728543514, "flos": 68730691570560.0, "grad_norm": 0.6824056349925876, "language_loss": 0.6019417, "learning_rate": 2.81469465318033e-06, "loss": 0.61967015, "num_input_tokens_seen": 137645770, "step": 6411, "time_per_iteration": 3.3692543506622314 }, { "auxiliary_loss_clip": 0.01091571, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.04337883, "balance_loss_mlp": 1.01451063, "epoch": 0.38551029610701937, "flos": 20485027025280.0, "grad_norm": 2.4386958956664344, "language_loss": 0.78219938, "learning_rate": 2.814338952773397e-06, "loss": 0.80340695, "num_input_tokens_seen": 137664090, "step": 6412, "time_per_iteration": 2.7462196350097656 }, { "auxiliary_loss_clip": 0.01097982, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.04309821, "balance_loss_mlp": 1.01995587, "epoch": 0.38557041935968733, "flos": 23471788775040.0, "grad_norm": 2.0249224045322802, "language_loss": 0.78112727, "learning_rate": 2.8139832214866493e-06, "loss": 0.80248463, "num_input_tokens_seen": 137683190, "step": 6413, "time_per_iteration": 2.768624782562256 }, { "auxiliary_loss_clip": 0.01056912, "auxiliary_loss_mlp": 0.01003998, "balance_loss_clip": 1.02733278, "balance_loss_mlp": 1.00254369, "epoch": 0.38563054261235535, "flos": 63966636869760.0, "grad_norm": 0.8082958368118873, "language_loss": 0.61342072, "learning_rate": 2.813627459333576e-06, "loss": 0.63402981, "num_input_tokens_seen": 137737315, "step": 6414, "time_per_iteration": 2.983466625213623 }, { "auxiliary_loss_clip": 0.01103716, "auxiliary_loss_mlp": 0.01038577, "balance_loss_clip": 1.05065155, "balance_loss_mlp": 1.02302015, "epoch": 0.3856906658650233, "flos": 23987789602560.0, "grad_norm": 2.2111312580879106, "language_loss": 0.77225536, "learning_rate": 2.8132716663276685e-06, "loss": 0.79367828, "num_input_tokens_seen": 137753535, "step": 6415, "time_per_iteration": 2.7486205101013184 }, { "auxiliary_loss_clip": 0.01109368, "auxiliary_loss_mlp": 0.01030786, "balance_loss_clip": 1.04894936, "balance_loss_mlp": 1.01676726, "epoch": 0.3857507891176913, "flos": 25007436979200.0, "grad_norm": 1.644505635534703, "language_loss": 0.80036473, "learning_rate": 2.8129158424824173e-06, "loss": 0.82176626, "num_input_tokens_seen": 137773405, "step": 6416, "time_per_iteration": 2.709200859069824 }, { "auxiliary_loss_clip": 0.0112133, "auxiliary_loss_mlp": 0.00771665, "balance_loss_clip": 1.04777813, "balance_loss_mlp": 1.00020468, "epoch": 0.38581091237035925, "flos": 21536778182400.0, "grad_norm": 1.8974153334913886, "language_loss": 0.78746861, "learning_rate": 2.8125599878113155e-06, "loss": 0.80639857, "num_input_tokens_seen": 137790810, "step": 6417, "time_per_iteration": 2.6839869022369385 }, { "auxiliary_loss_clip": 0.01106617, "auxiliary_loss_mlp": 0.0103795, "balance_loss_clip": 1.04771507, "balance_loss_mlp": 1.02424121, "epoch": 0.3858710356230272, "flos": 17383889393280.0, "grad_norm": 1.8492847143532247, "language_loss": 0.80066824, "learning_rate": 2.8122041023278583e-06, "loss": 0.82211387, "num_input_tokens_seen": 137810265, "step": 6418, "time_per_iteration": 2.709463119506836 }, { "auxiliary_loss_clip": 0.01106426, "auxiliary_loss_mlp": 0.01035927, "balance_loss_clip": 1.04606509, "balance_loss_mlp": 1.02115691, "epoch": 0.3859311588756952, "flos": 20339588856960.0, "grad_norm": 2.0121704661475524, "language_loss": 0.79591382, "learning_rate": 2.8118481860455407e-06, "loss": 0.81733727, "num_input_tokens_seen": 137828580, "step": 6419, "time_per_iteration": 2.687030553817749 }, { "auxiliary_loss_clip": 0.01109367, "auxiliary_loss_mlp": 0.01035627, "balance_loss_clip": 1.04662013, "balance_loss_mlp": 1.0194031, "epoch": 0.38599128212836314, "flos": 26321157002880.0, "grad_norm": 2.202509680177809, "language_loss": 0.67581224, "learning_rate": 2.8114922389778573e-06, "loss": 0.69726223, "num_input_tokens_seen": 137846145, "step": 6420, "time_per_iteration": 2.7517049312591553 }, { "auxiliary_loss_clip": 0.01089731, "auxiliary_loss_mlp": 0.01053637, "balance_loss_clip": 1.04479241, "balance_loss_mlp": 1.03771043, "epoch": 0.3860514053810311, "flos": 13553837066880.0, "grad_norm": 2.406147976104497, "language_loss": 0.81137526, "learning_rate": 2.8111362611383076e-06, "loss": 0.83280897, "num_input_tokens_seen": 137863705, "step": 6421, "time_per_iteration": 2.970040798187256 }, { "auxiliary_loss_clip": 0.01108309, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.04625583, "balance_loss_mlp": 1.02510345, "epoch": 0.3861115286336991, "flos": 20954271323520.0, "grad_norm": 2.6092074943148797, "language_loss": 0.71989834, "learning_rate": 2.8107802525403886e-06, "loss": 0.74138188, "num_input_tokens_seen": 137880285, "step": 6422, "time_per_iteration": 2.690490961074829 }, { "auxiliary_loss_clip": 0.01104575, "auxiliary_loss_mlp": 0.0104152, "balance_loss_clip": 1.04663455, "balance_loss_mlp": 1.02759588, "epoch": 0.38617165188636704, "flos": 16362697731840.0, "grad_norm": 1.6942063430957965, "language_loss": 0.66644311, "learning_rate": 2.8104242131976025e-06, "loss": 0.687904, "num_input_tokens_seen": 137898335, "step": 6423, "time_per_iteration": 2.6189329624176025 }, { "auxiliary_loss_clip": 0.01128312, "auxiliary_loss_mlp": 0.01042786, "balance_loss_clip": 1.05139875, "balance_loss_mlp": 1.02860618, "epoch": 0.386231775139035, "flos": 34787276893440.0, "grad_norm": 2.1536039728580394, "language_loss": 0.68359423, "learning_rate": 2.810068143123449e-06, "loss": 0.70530522, "num_input_tokens_seen": 137918605, "step": 6424, "time_per_iteration": 2.7609992027282715 }, { "auxiliary_loss_clip": 0.01098796, "auxiliary_loss_mlp": 0.01038309, "balance_loss_clip": 1.04750848, "balance_loss_mlp": 1.02387285, "epoch": 0.38629189839170297, "flos": 21726171619200.0, "grad_norm": 1.4481478329406698, "language_loss": 0.72367114, "learning_rate": 2.809712042331429e-06, "loss": 0.7450422, "num_input_tokens_seen": 137938245, "step": 6425, "time_per_iteration": 2.7069387435913086 }, { "auxiliary_loss_clip": 0.01099551, "auxiliary_loss_mlp": 0.00773141, "balance_loss_clip": 1.0428803, "balance_loss_mlp": 1.00013173, "epoch": 0.38635202164437094, "flos": 27923634460800.0, "grad_norm": 2.52438881915832, "language_loss": 0.80258477, "learning_rate": 2.8093559108350484e-06, "loss": 0.82131171, "num_input_tokens_seen": 137956770, "step": 6426, "time_per_iteration": 2.8976056575775146 }, { "auxiliary_loss_clip": 0.01125602, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.04929447, "balance_loss_mlp": 1.02013016, "epoch": 0.38641214489703896, "flos": 23586631534080.0, "grad_norm": 2.2578291383073825, "language_loss": 0.7536087, "learning_rate": 2.80899974864781e-06, "loss": 0.77521622, "num_input_tokens_seen": 137977040, "step": 6427, "time_per_iteration": 2.7281436920166016 }, { "auxiliary_loss_clip": 0.01075932, "auxiliary_loss_mlp": 0.01057335, "balance_loss_clip": 1.04142189, "balance_loss_mlp": 1.04013276, "epoch": 0.3864722681497069, "flos": 12641239198080.0, "grad_norm": 2.0875975256988055, "language_loss": 0.69435054, "learning_rate": 2.8086435557832203e-06, "loss": 0.71568322, "num_input_tokens_seen": 137993545, "step": 6428, "time_per_iteration": 2.7289116382598877 }, { "auxiliary_loss_clip": 0.01113154, "auxiliary_loss_mlp": 0.01042018, "balance_loss_clip": 1.04947257, "balance_loss_mlp": 1.02729535, "epoch": 0.3865323914023749, "flos": 17598922162560.0, "grad_norm": 2.847119477349317, "language_loss": 0.8444519, "learning_rate": 2.8082873322547863e-06, "loss": 0.86600363, "num_input_tokens_seen": 138010140, "step": 6429, "time_per_iteration": 2.7385170459747314 }, { "auxiliary_loss_clip": 0.01110797, "auxiliary_loss_mlp": 0.01038599, "balance_loss_clip": 1.04555535, "balance_loss_mlp": 1.02423429, "epoch": 0.38659251465504285, "flos": 18478949374080.0, "grad_norm": 2.174010980525696, "language_loss": 0.80673695, "learning_rate": 2.807931078076015e-06, "loss": 0.82823092, "num_input_tokens_seen": 138028880, "step": 6430, "time_per_iteration": 2.660228967666626 }, { "auxiliary_loss_clip": 0.0102628, "auxiliary_loss_mlp": 0.01015101, "balance_loss_clip": 1.02508974, "balance_loss_mlp": 1.01382565, "epoch": 0.3866526379077108, "flos": 64165726978560.0, "grad_norm": 0.719429045650031, "language_loss": 0.58803207, "learning_rate": 2.807574793260416e-06, "loss": 0.60844588, "num_input_tokens_seen": 138098090, "step": 6431, "time_per_iteration": 3.2772469520568848 }, { "auxiliary_loss_clip": 0.01086398, "auxiliary_loss_mlp": 0.01039293, "balance_loss_clip": 1.04541588, "balance_loss_mlp": 1.02296114, "epoch": 0.3867127611603788, "flos": 14388292897920.0, "grad_norm": 2.1660589654497424, "language_loss": 0.79041815, "learning_rate": 2.8072184778215004e-06, "loss": 0.81167507, "num_input_tokens_seen": 138114735, "step": 6432, "time_per_iteration": 2.7949061393737793 }, { "auxiliary_loss_clip": 0.01125593, "auxiliary_loss_mlp": 0.01048624, "balance_loss_clip": 1.04708362, "balance_loss_mlp": 1.03231645, "epoch": 0.38677288441304675, "flos": 20010754823040.0, "grad_norm": 2.0695366497364294, "language_loss": 0.80186564, "learning_rate": 2.806862131772779e-06, "loss": 0.82360786, "num_input_tokens_seen": 138130480, "step": 6433, "time_per_iteration": 2.6526312828063965 }, { "auxiliary_loss_clip": 0.01111087, "auxiliary_loss_mlp": 0.01037922, "balance_loss_clip": 1.04934025, "balance_loss_mlp": 1.02162611, "epoch": 0.3868330076657147, "flos": 22236893147520.0, "grad_norm": 1.6267030007711512, "language_loss": 0.70441496, "learning_rate": 2.806505755127765e-06, "loss": 0.72590506, "num_input_tokens_seen": 138150640, "step": 6434, "time_per_iteration": 2.6985394954681396 }, { "auxiliary_loss_clip": 0.01097728, "auxiliary_loss_mlp": 0.01047403, "balance_loss_clip": 1.04536152, "balance_loss_mlp": 1.03008235, "epoch": 0.3868931309183827, "flos": 16727442387840.0, "grad_norm": 1.7348790517482282, "language_loss": 0.77462173, "learning_rate": 2.806149347899972e-06, "loss": 0.79607308, "num_input_tokens_seen": 138169700, "step": 6435, "time_per_iteration": 2.7326719760894775 }, { "auxiliary_loss_clip": 0.01119609, "auxiliary_loss_mlp": 0.01035834, "balance_loss_clip": 1.04651809, "balance_loss_mlp": 1.0208497, "epoch": 0.38695325417105064, "flos": 22674716023680.0, "grad_norm": 2.3575842278582813, "language_loss": 0.79599082, "learning_rate": 2.805792910102915e-06, "loss": 0.81754529, "num_input_tokens_seen": 138185835, "step": 6436, "time_per_iteration": 2.6643154621124268 }, { "auxiliary_loss_clip": 0.01107099, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.04809546, "balance_loss_mlp": 1.0215621, "epoch": 0.3870133774237186, "flos": 23112036109440.0, "grad_norm": 1.9038851888933561, "language_loss": 0.76043606, "learning_rate": 2.8054364417501093e-06, "loss": 0.78187203, "num_input_tokens_seen": 138204080, "step": 6437, "time_per_iteration": 2.701834201812744 }, { "auxiliary_loss_clip": 0.01110073, "auxiliary_loss_mlp": 0.01037115, "balance_loss_clip": 1.04696321, "balance_loss_mlp": 1.02374589, "epoch": 0.3870735006763866, "flos": 17675699483520.0, "grad_norm": 2.022501790448194, "language_loss": 0.81817484, "learning_rate": 2.805079942855074e-06, "loss": 0.8396467, "num_input_tokens_seen": 138220710, "step": 6438, "time_per_iteration": 4.327820539474487 }, { "auxiliary_loss_clip": 0.01111326, "auxiliary_loss_mlp": 0.0077319, "balance_loss_clip": 1.04504764, "balance_loss_mlp": 1.00027561, "epoch": 0.38713362392905454, "flos": 23295791111040.0, "grad_norm": 1.7517226143139228, "language_loss": 0.75388491, "learning_rate": 2.804723413431326e-06, "loss": 0.77273011, "num_input_tokens_seen": 138241720, "step": 6439, "time_per_iteration": 2.797830104827881 }, { "auxiliary_loss_clip": 0.01131277, "auxiliary_loss_mlp": 0.01037901, "balance_loss_clip": 1.04915833, "balance_loss_mlp": 1.0235002, "epoch": 0.38719374718172256, "flos": 21031192298880.0, "grad_norm": 1.7565856090832077, "language_loss": 0.74071443, "learning_rate": 2.8043668534923855e-06, "loss": 0.76240611, "num_input_tokens_seen": 138261885, "step": 6440, "time_per_iteration": 4.2160422801971436 }, { "auxiliary_loss_clip": 0.01125111, "auxiliary_loss_mlp": 0.01034995, "balance_loss_clip": 1.04927301, "balance_loss_mlp": 1.01949763, "epoch": 0.3872538704343905, "flos": 19609776322560.0, "grad_norm": 2.101028456947384, "language_loss": 0.82017142, "learning_rate": 2.804010263051774e-06, "loss": 0.84177244, "num_input_tokens_seen": 138280255, "step": 6441, "time_per_iteration": 4.199851036071777 }, { "auxiliary_loss_clip": 0.0113476, "auxiliary_loss_mlp": 0.01039285, "balance_loss_clip": 1.05011272, "balance_loss_mlp": 1.02490842, "epoch": 0.3873139936870585, "flos": 17530045833600.0, "grad_norm": 2.8802239922493147, "language_loss": 0.80824792, "learning_rate": 2.8036536421230118e-06, "loss": 0.82998842, "num_input_tokens_seen": 138296675, "step": 6442, "time_per_iteration": 2.6942524909973145 }, { "auxiliary_loss_clip": 0.01090073, "auxiliary_loss_mlp": 0.01032275, "balance_loss_clip": 1.04431343, "balance_loss_mlp": 1.01747537, "epoch": 0.38737411693972645, "flos": 17786555832960.0, "grad_norm": 2.1593394156288044, "language_loss": 0.84054118, "learning_rate": 2.803296990719624e-06, "loss": 0.86176467, "num_input_tokens_seen": 138314985, "step": 6443, "time_per_iteration": 2.6660094261169434 }, { "auxiliary_loss_clip": 0.01033878, "auxiliary_loss_mlp": 0.01000185, "balance_loss_clip": 1.02513885, "balance_loss_mlp": 0.99879646, "epoch": 0.3874342401923944, "flos": 58304637048960.0, "grad_norm": 0.7605185654135588, "language_loss": 0.50208193, "learning_rate": 2.8029403088551327e-06, "loss": 0.52242255, "num_input_tokens_seen": 138373275, "step": 6444, "time_per_iteration": 4.807433128356934 }, { "auxiliary_loss_clip": 0.01086333, "auxiliary_loss_mlp": 0.00773648, "balance_loss_clip": 1.04187298, "balance_loss_mlp": 1.00033963, "epoch": 0.3874943634450624, "flos": 17711933328000.0, "grad_norm": 1.4666177781563792, "language_loss": 0.78874767, "learning_rate": 2.802583596543065e-06, "loss": 0.80734754, "num_input_tokens_seen": 138391145, "step": 6445, "time_per_iteration": 2.689142942428589 }, { "auxiliary_loss_clip": 0.0111426, "auxiliary_loss_mlp": 0.0103959, "balance_loss_clip": 1.04754841, "balance_loss_mlp": 1.02445602, "epoch": 0.38755448669773035, "flos": 19244852098560.0, "grad_norm": 2.4274750437973958, "language_loss": 0.81207073, "learning_rate": 2.8022268537969474e-06, "loss": 0.83360916, "num_input_tokens_seen": 138409875, "step": 6446, "time_per_iteration": 2.6582860946655273 }, { "auxiliary_loss_clip": 0.01107394, "auxiliary_loss_mlp": 0.01037275, "balance_loss_clip": 1.04530001, "balance_loss_mlp": 1.02277923, "epoch": 0.3876146099503983, "flos": 20594267262720.0, "grad_norm": 3.0137556994939887, "language_loss": 0.77366996, "learning_rate": 2.801870080630306e-06, "loss": 0.79511666, "num_input_tokens_seen": 138428965, "step": 6447, "time_per_iteration": 2.727285146713257 }, { "auxiliary_loss_clip": 0.01108854, "auxiliary_loss_mlp": 0.01037807, "balance_loss_clip": 1.04590762, "balance_loss_mlp": 1.02378821, "epoch": 0.3876747332030663, "flos": 19281121856640.0, "grad_norm": 2.4450461903172562, "language_loss": 0.76364803, "learning_rate": 2.801513277056671e-06, "loss": 0.78511459, "num_input_tokens_seen": 138448090, "step": 6448, "time_per_iteration": 2.663989543914795 }, { "auxiliary_loss_clip": 0.01102873, "auxiliary_loss_mlp": 0.01038866, "balance_loss_clip": 1.04449654, "balance_loss_mlp": 1.02322626, "epoch": 0.38773485645573424, "flos": 18945895201920.0, "grad_norm": 1.6490971101368535, "language_loss": 0.76146352, "learning_rate": 2.8011564430895725e-06, "loss": 0.7828809, "num_input_tokens_seen": 138466105, "step": 6449, "time_per_iteration": 2.806537628173828 }, { "auxiliary_loss_clip": 0.01098531, "auxiliary_loss_mlp": 0.00772575, "balance_loss_clip": 1.04406381, "balance_loss_mlp": 1.00027394, "epoch": 0.3877949797084022, "flos": 23071348978560.0, "grad_norm": 2.0995234377866985, "language_loss": 0.78572172, "learning_rate": 2.800799578742542e-06, "loss": 0.80443275, "num_input_tokens_seen": 138485160, "step": 6450, "time_per_iteration": 2.7541351318359375 }, { "auxiliary_loss_clip": 0.01137663, "auxiliary_loss_mlp": 0.01039948, "balance_loss_clip": 1.04827702, "balance_loss_mlp": 1.02452803, "epoch": 0.3878551029610702, "flos": 29095543589760.0, "grad_norm": 2.5655640440870946, "language_loss": 0.78046334, "learning_rate": 2.8004426840291106e-06, "loss": 0.80223942, "num_input_tokens_seen": 138504135, "step": 6451, "time_per_iteration": 2.6868700981140137 }, { "auxiliary_loss_clip": 0.01126689, "auxiliary_loss_mlp": 0.01031637, "balance_loss_clip": 1.04576159, "balance_loss_mlp": 1.01696229, "epoch": 0.38791522621373814, "flos": 20996394998400.0, "grad_norm": 2.633183178462793, "language_loss": 0.76404589, "learning_rate": 2.800085758962812e-06, "loss": 0.78562915, "num_input_tokens_seen": 138523955, "step": 6452, "time_per_iteration": 2.708750009536743 }, { "auxiliary_loss_clip": 0.01103834, "auxiliary_loss_mlp": 0.01042785, "balance_loss_clip": 1.04665875, "balance_loss_mlp": 1.0285815, "epoch": 0.3879753494664061, "flos": 15486836497920.0, "grad_norm": 1.5878969811553463, "language_loss": 0.79534453, "learning_rate": 2.799728803557182e-06, "loss": 0.81681073, "num_input_tokens_seen": 138541655, "step": 6453, "time_per_iteration": 2.7226593494415283 }, { "auxiliary_loss_clip": 0.0112782, "auxiliary_loss_mlp": 0.0104096, "balance_loss_clip": 1.04889584, "balance_loss_mlp": 1.02560616, "epoch": 0.3880354727190741, "flos": 22053964158720.0, "grad_norm": 19.957823861770734, "language_loss": 0.71643323, "learning_rate": 2.7993718178257555e-06, "loss": 0.73812103, "num_input_tokens_seen": 138560860, "step": 6454, "time_per_iteration": 2.7265548706054688 }, { "auxiliary_loss_clip": 0.01137183, "auxiliary_loss_mlp": 0.01043076, "balance_loss_clip": 1.04976404, "balance_loss_mlp": 1.02693522, "epoch": 0.3880955959717421, "flos": 20340307128960.0, "grad_norm": 2.029110970619929, "language_loss": 0.77489239, "learning_rate": 2.7990148017820694e-06, "loss": 0.79669499, "num_input_tokens_seen": 138580200, "step": 6455, "time_per_iteration": 2.7688205242156982 }, { "auxiliary_loss_clip": 0.01131496, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.04975748, "balance_loss_mlp": 1.02897501, "epoch": 0.38815571922441006, "flos": 23075407215360.0, "grad_norm": 1.8133016626985128, "language_loss": 0.76193333, "learning_rate": 2.798657755439662e-06, "loss": 0.78368604, "num_input_tokens_seen": 138598315, "step": 6456, "time_per_iteration": 2.6894283294677734 }, { "auxiliary_loss_clip": 0.01059894, "auxiliary_loss_mlp": 0.01038862, "balance_loss_clip": 1.04251969, "balance_loss_mlp": 1.02365136, "epoch": 0.388215842477078, "flos": 20776944856320.0, "grad_norm": 9.416859416659493, "language_loss": 0.59422505, "learning_rate": 2.7983006788120726e-06, "loss": 0.61521268, "num_input_tokens_seen": 138615695, "step": 6457, "time_per_iteration": 2.8189444541931152 }, { "auxiliary_loss_clip": 0.01136561, "auxiliary_loss_mlp": 0.01039144, "balance_loss_clip": 1.04989612, "balance_loss_mlp": 1.02262187, "epoch": 0.388275965729746, "flos": 20448182649600.0, "grad_norm": 2.336997181985419, "language_loss": 0.79927063, "learning_rate": 2.797943571912841e-06, "loss": 0.82102776, "num_input_tokens_seen": 138633180, "step": 6458, "time_per_iteration": 2.66198992729187 }, { "auxiliary_loss_clip": 0.01081764, "auxiliary_loss_mlp": 0.0104529, "balance_loss_clip": 1.04428816, "balance_loss_mlp": 1.02855277, "epoch": 0.38833608898241395, "flos": 27892392606720.0, "grad_norm": 2.218973373394608, "language_loss": 0.81497735, "learning_rate": 2.797586434755509e-06, "loss": 0.83624792, "num_input_tokens_seen": 138654785, "step": 6459, "time_per_iteration": 2.780120611190796 }, { "auxiliary_loss_clip": 0.01105714, "auxiliary_loss_mlp": 0.01037251, "balance_loss_clip": 1.04633725, "balance_loss_mlp": 1.0236907, "epoch": 0.3883962122350819, "flos": 18076390675200.0, "grad_norm": 1.942341955564712, "language_loss": 0.62001127, "learning_rate": 2.7972292673536202e-06, "loss": 0.64144087, "num_input_tokens_seen": 138673330, "step": 6460, "time_per_iteration": 2.625399112701416 }, { "auxiliary_loss_clip": 0.01120569, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.04955411, "balance_loss_mlp": 1.01920033, "epoch": 0.3884563354877499, "flos": 23622254847360.0, "grad_norm": 1.928823237011181, "language_loss": 0.86226058, "learning_rate": 2.796872069720717e-06, "loss": 0.88379019, "num_input_tokens_seen": 138694185, "step": 6461, "time_per_iteration": 2.6901583671569824 }, { "auxiliary_loss_clip": 0.0111976, "auxiliary_loss_mlp": 0.01038779, "balance_loss_clip": 1.04810238, "balance_loss_mlp": 1.0244205, "epoch": 0.38851645874041785, "flos": 27453528236160.0, "grad_norm": 4.963760229824091, "language_loss": 0.70659202, "learning_rate": 2.7965148418703456e-06, "loss": 0.72817743, "num_input_tokens_seen": 138714625, "step": 6462, "time_per_iteration": 2.7463371753692627 }, { "auxiliary_loss_clip": 0.01086013, "auxiliary_loss_mlp": 0.01043745, "balance_loss_clip": 1.04045033, "balance_loss_mlp": 1.02786636, "epoch": 0.3885765819930858, "flos": 25228072270080.0, "grad_norm": 2.747031306466439, "language_loss": 0.76228201, "learning_rate": 2.796157583816052e-06, "loss": 0.78357965, "num_input_tokens_seen": 138733585, "step": 6463, "time_per_iteration": 2.7231578826904297 }, { "auxiliary_loss_clip": 0.01103201, "auxiliary_loss_mlp": 0.0104459, "balance_loss_clip": 1.05013013, "balance_loss_mlp": 1.02841353, "epoch": 0.3886367052457538, "flos": 16946605221120.0, "grad_norm": 3.605418601568306, "language_loss": 0.70244539, "learning_rate": 2.795800295571382e-06, "loss": 0.72392333, "num_input_tokens_seen": 138752335, "step": 6464, "time_per_iteration": 2.773066759109497 }, { "auxiliary_loss_clip": 0.01110861, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.04950786, "balance_loss_mlp": 1.02211452, "epoch": 0.38869682849842174, "flos": 27154140376320.0, "grad_norm": 2.8184770761764777, "language_loss": 0.69632983, "learning_rate": 2.7954429771498858e-06, "loss": 0.71780872, "num_input_tokens_seen": 138768450, "step": 6465, "time_per_iteration": 2.7013487815856934 }, { "auxiliary_loss_clip": 0.01097351, "auxiliary_loss_mlp": 0.01041748, "balance_loss_clip": 1.04837847, "balance_loss_mlp": 1.02645373, "epoch": 0.3887569517510897, "flos": 21063619301760.0, "grad_norm": 2.665243237814177, "language_loss": 0.78489739, "learning_rate": 2.7950856285651117e-06, "loss": 0.80628836, "num_input_tokens_seen": 138786775, "step": 6466, "time_per_iteration": 2.736819267272949 }, { "auxiliary_loss_clip": 0.01095374, "auxiliary_loss_mlp": 0.01037568, "balance_loss_clip": 1.0463171, "balance_loss_mlp": 1.02242851, "epoch": 0.38881707500375773, "flos": 29497384016640.0, "grad_norm": 1.6522613533538497, "language_loss": 0.69341898, "learning_rate": 2.794728249830611e-06, "loss": 0.71474838, "num_input_tokens_seen": 138810100, "step": 6467, "time_per_iteration": 2.778083324432373 }, { "auxiliary_loss_clip": 0.01098114, "auxiliary_loss_mlp": 0.01048152, "balance_loss_clip": 1.04706931, "balance_loss_mlp": 1.0326246, "epoch": 0.3888771982564257, "flos": 17488281294720.0, "grad_norm": 3.2276382920067817, "language_loss": 0.84199375, "learning_rate": 2.794370840959936e-06, "loss": 0.86345637, "num_input_tokens_seen": 138825140, "step": 6468, "time_per_iteration": 2.6842098236083984 }, { "auxiliary_loss_clip": 0.01108569, "auxiliary_loss_mlp": 0.01036235, "balance_loss_clip": 1.048172, "balance_loss_mlp": 1.0227766, "epoch": 0.38893732150909366, "flos": 21942425450880.0, "grad_norm": 1.8219377355536144, "language_loss": 0.84232908, "learning_rate": 2.7940134019666383e-06, "loss": 0.86377716, "num_input_tokens_seen": 138844115, "step": 6469, "time_per_iteration": 2.7538135051727295 }, { "auxiliary_loss_clip": 0.0109067, "auxiliary_loss_mlp": 0.01048288, "balance_loss_clip": 1.04416847, "balance_loss_mlp": 1.03205132, "epoch": 0.3889974447617616, "flos": 24276367468800.0, "grad_norm": 2.339210402911935, "language_loss": 0.75173676, "learning_rate": 2.793655932864273e-06, "loss": 0.7731263, "num_input_tokens_seen": 138860860, "step": 6470, "time_per_iteration": 2.7425949573516846 }, { "auxiliary_loss_clip": 0.01095528, "auxiliary_loss_mlp": 0.00772188, "balance_loss_clip": 1.0480423, "balance_loss_mlp": 1.00016475, "epoch": 0.3890575680144296, "flos": 25667116208640.0, "grad_norm": 1.5943716760052937, "language_loss": 0.74977577, "learning_rate": 2.7932984336663953e-06, "loss": 0.76845288, "num_input_tokens_seen": 138881910, "step": 6471, "time_per_iteration": 2.8880369663238525 }, { "auxiliary_loss_clip": 0.01077518, "auxiliary_loss_mlp": 0.01049277, "balance_loss_clip": 1.03879571, "balance_loss_mlp": 1.03336215, "epoch": 0.38911769126709755, "flos": 22855274714880.0, "grad_norm": 2.421548050110463, "language_loss": 0.67984551, "learning_rate": 2.792940904386562e-06, "loss": 0.70111346, "num_input_tokens_seen": 138900975, "step": 6472, "time_per_iteration": 2.7776875495910645 }, { "auxiliary_loss_clip": 0.01103596, "auxiliary_loss_mlp": 0.01043152, "balance_loss_clip": 1.04819107, "balance_loss_mlp": 1.02974129, "epoch": 0.3891778145197655, "flos": 25447522412160.0, "grad_norm": 1.8102352941433608, "language_loss": 0.76068687, "learning_rate": 2.7925833450383293e-06, "loss": 0.78215432, "num_input_tokens_seen": 138920795, "step": 6473, "time_per_iteration": 2.7568469047546387 }, { "auxiliary_loss_clip": 0.01113975, "auxiliary_loss_mlp": 0.01046096, "balance_loss_clip": 1.05217087, "balance_loss_mlp": 1.03031242, "epoch": 0.3892379377724335, "flos": 14027965614720.0, "grad_norm": 2.045216735434868, "language_loss": 0.70959115, "learning_rate": 2.792225755635257e-06, "loss": 0.73119187, "num_input_tokens_seen": 138938770, "step": 6474, "time_per_iteration": 2.6930696964263916 }, { "auxiliary_loss_clip": 0.01135028, "auxiliary_loss_mlp": 0.01042055, "balance_loss_clip": 1.05145836, "balance_loss_mlp": 1.02861369, "epoch": 0.38929806102510145, "flos": 20157449967360.0, "grad_norm": 1.5519949793695216, "language_loss": 0.69049072, "learning_rate": 2.7918681361909046e-06, "loss": 0.71226156, "num_input_tokens_seen": 138958880, "step": 6475, "time_per_iteration": 2.670830011367798 }, { "auxiliary_loss_clip": 0.01110637, "auxiliary_loss_mlp": 0.01057592, "balance_loss_clip": 1.04578567, "balance_loss_mlp": 1.03981757, "epoch": 0.3893581842777694, "flos": 22163958581760.0, "grad_norm": 1.9596553320764234, "language_loss": 0.75820196, "learning_rate": 2.7915104867188332e-06, "loss": 0.77988434, "num_input_tokens_seen": 138977240, "step": 6476, "time_per_iteration": 2.683980941772461 }, { "auxiliary_loss_clip": 0.01039888, "auxiliary_loss_mlp": 0.01002183, "balance_loss_clip": 1.02862918, "balance_loss_mlp": 1.00084782, "epoch": 0.3894183075304374, "flos": 67301877392640.0, "grad_norm": 0.7759740468574157, "language_loss": 0.58146399, "learning_rate": 2.7911528072326055e-06, "loss": 0.60188472, "num_input_tokens_seen": 139039035, "step": 6477, "time_per_iteration": 3.2430496215820312 }, { "auxiliary_loss_clip": 0.01092497, "auxiliary_loss_mlp": 0.01040603, "balance_loss_clip": 1.04780793, "balance_loss_mlp": 1.02428961, "epoch": 0.38947843078310534, "flos": 18547502480640.0, "grad_norm": 1.9073891309950948, "language_loss": 0.78554142, "learning_rate": 2.7907950977457832e-06, "loss": 0.80687243, "num_input_tokens_seen": 139055560, "step": 6478, "time_per_iteration": 4.241156339645386 }, { "auxiliary_loss_clip": 0.01116081, "auxiliary_loss_mlp": 0.0103975, "balance_loss_clip": 1.04505491, "balance_loss_mlp": 1.02545047, "epoch": 0.3895385540357733, "flos": 14605875532800.0, "grad_norm": 2.6992371438810783, "language_loss": 0.82647753, "learning_rate": 2.7904373582719317e-06, "loss": 0.84803581, "num_input_tokens_seen": 139071865, "step": 6479, "time_per_iteration": 4.1569294929504395 }, { "auxiliary_loss_clip": 0.01131381, "auxiliary_loss_mlp": 0.01036344, "balance_loss_clip": 1.04886651, "balance_loss_mlp": 1.02161551, "epoch": 0.38959867728844133, "flos": 19975203336960.0, "grad_norm": 2.334048099077096, "language_loss": 0.79657412, "learning_rate": 2.790079588824617e-06, "loss": 0.81825137, "num_input_tokens_seen": 139089640, "step": 6480, "time_per_iteration": 4.170635938644409 }, { "auxiliary_loss_clip": 0.0110471, "auxiliary_loss_mlp": 0.01032466, "balance_loss_clip": 1.04561472, "balance_loss_mlp": 1.01822066, "epoch": 0.3896588005411093, "flos": 22672130244480.0, "grad_norm": 6.364109786330533, "language_loss": 0.83021134, "learning_rate": 2.7897217894174038e-06, "loss": 0.85158312, "num_input_tokens_seen": 139109365, "step": 6481, "time_per_iteration": 2.638821840286255 }, { "auxiliary_loss_clip": 0.01102815, "auxiliary_loss_mlp": 0.01038843, "balance_loss_clip": 1.04740214, "balance_loss_mlp": 1.02503228, "epoch": 0.38971892379377726, "flos": 20996035862400.0, "grad_norm": 1.7002276765936415, "language_loss": 0.75389051, "learning_rate": 2.789363960063863e-06, "loss": 0.77530706, "num_input_tokens_seen": 139128260, "step": 6482, "time_per_iteration": 2.5737624168395996 }, { "auxiliary_loss_clip": 0.01100553, "auxiliary_loss_mlp": 0.01035815, "balance_loss_clip": 1.04781246, "balance_loss_mlp": 1.02164662, "epoch": 0.3897790470464452, "flos": 22528487756160.0, "grad_norm": 2.0703094316503554, "language_loss": 0.78786725, "learning_rate": 2.78900610077756e-06, "loss": 0.80923092, "num_input_tokens_seen": 139147315, "step": 6483, "time_per_iteration": 2.6177117824554443 }, { "auxiliary_loss_clip": 0.01121516, "auxiliary_loss_mlp": 0.01030702, "balance_loss_clip": 1.04790664, "balance_loss_mlp": 1.01487088, "epoch": 0.3898391702991132, "flos": 26209905603840.0, "grad_norm": 1.6677367088018817, "language_loss": 0.79871929, "learning_rate": 2.788648211572067e-06, "loss": 0.82024151, "num_input_tokens_seen": 139167270, "step": 6484, "time_per_iteration": 4.221461534500122 }, { "auxiliary_loss_clip": 0.01119394, "auxiliary_loss_mlp": 0.01051487, "balance_loss_clip": 1.05063844, "balance_loss_mlp": 1.03472662, "epoch": 0.38989929355178116, "flos": 21065558636160.0, "grad_norm": 2.1008000508061104, "language_loss": 0.77901775, "learning_rate": 2.7882902924609557e-06, "loss": 0.80072653, "num_input_tokens_seen": 139185970, "step": 6485, "time_per_iteration": 2.664097785949707 }, { "auxiliary_loss_clip": 0.01085813, "auxiliary_loss_mlp": 0.01036912, "balance_loss_clip": 1.0427084, "balance_loss_mlp": 1.02207613, "epoch": 0.3899594168044491, "flos": 25484115392640.0, "grad_norm": 6.223818029706007, "language_loss": 0.85190272, "learning_rate": 2.7879323434577965e-06, "loss": 0.87312996, "num_input_tokens_seen": 139203730, "step": 6486, "time_per_iteration": 2.8325467109680176 }, { "auxiliary_loss_clip": 0.01111569, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.04786611, "balance_loss_mlp": 1.01883638, "epoch": 0.3900195400571171, "flos": 31139363456640.0, "grad_norm": 2.4250185390770618, "language_loss": 0.85333234, "learning_rate": 2.7875743645761645e-06, "loss": 0.87478197, "num_input_tokens_seen": 139222560, "step": 6487, "time_per_iteration": 2.8390486240386963 }, { "auxiliary_loss_clip": 0.01103222, "auxiliary_loss_mlp": 0.01032994, "balance_loss_clip": 1.04449213, "balance_loss_mlp": 1.01793766, "epoch": 0.39007966330978505, "flos": 20229917656320.0, "grad_norm": 1.5390409302603854, "language_loss": 0.72954559, "learning_rate": 2.787216355829633e-06, "loss": 0.75090778, "num_input_tokens_seen": 139242165, "step": 6488, "time_per_iteration": 2.7613236904144287 }, { "auxiliary_loss_clip": 0.01096805, "auxiliary_loss_mlp": 0.01044873, "balance_loss_clip": 1.04673266, "balance_loss_mlp": 1.02771914, "epoch": 0.390139786562453, "flos": 22528739151360.0, "grad_norm": 2.6420160637986383, "language_loss": 0.68467176, "learning_rate": 2.786858317231779e-06, "loss": 0.70608854, "num_input_tokens_seen": 139262525, "step": 6489, "time_per_iteration": 2.746307849884033 }, { "auxiliary_loss_clip": 0.01108111, "auxiliary_loss_mlp": 0.01041602, "balance_loss_clip": 1.04793715, "balance_loss_mlp": 1.02673674, "epoch": 0.390199909815121, "flos": 26432911192320.0, "grad_norm": 1.6912118236512272, "language_loss": 0.80629271, "learning_rate": 2.7865002487961788e-06, "loss": 0.82778984, "num_input_tokens_seen": 139282835, "step": 6490, "time_per_iteration": 2.7116847038269043 }, { "auxiliary_loss_clip": 0.01124963, "auxiliary_loss_mlp": 0.01033045, "balance_loss_clip": 1.04856181, "balance_loss_mlp": 1.0187161, "epoch": 0.39026003306778895, "flos": 17274577328640.0, "grad_norm": 3.073568327903315, "language_loss": 0.89115125, "learning_rate": 2.7861421505364104e-06, "loss": 0.91273135, "num_input_tokens_seen": 139299490, "step": 6491, "time_per_iteration": 2.6211190223693848 }, { "auxiliary_loss_clip": 0.01092029, "auxiliary_loss_mlp": 0.01045074, "balance_loss_clip": 1.04406416, "balance_loss_mlp": 1.02952874, "epoch": 0.3903201563204569, "flos": 24532841554560.0, "grad_norm": 1.8064559635296407, "language_loss": 0.78637981, "learning_rate": 2.7857840224660523e-06, "loss": 0.80775088, "num_input_tokens_seen": 139317865, "step": 6492, "time_per_iteration": 2.7505667209625244 }, { "auxiliary_loss_clip": 0.01108778, "auxiliary_loss_mlp": 0.01041967, "balance_loss_clip": 1.04486537, "balance_loss_mlp": 1.02735257, "epoch": 0.39038027957312493, "flos": 23767944410880.0, "grad_norm": 1.7227367696506604, "language_loss": 0.74431908, "learning_rate": 2.7854258645986857e-06, "loss": 0.76582652, "num_input_tokens_seen": 139339840, "step": 6493, "time_per_iteration": 2.7200233936309814 }, { "auxiliary_loss_clip": 0.01091358, "auxiliary_loss_mlp": 0.01040258, "balance_loss_clip": 1.04613161, "balance_loss_mlp": 1.02549398, "epoch": 0.3904404028257929, "flos": 14100612871680.0, "grad_norm": 2.9656676182999395, "language_loss": 0.7637316, "learning_rate": 2.7850676769478916e-06, "loss": 0.78504777, "num_input_tokens_seen": 139357555, "step": 6494, "time_per_iteration": 2.6818442344665527 }, { "auxiliary_loss_clip": 0.01131498, "auxiliary_loss_mlp": 0.01048378, "balance_loss_clip": 1.0500524, "balance_loss_mlp": 1.03182006, "epoch": 0.39050052607846086, "flos": 16910048154240.0, "grad_norm": 2.1152980497113782, "language_loss": 0.74208486, "learning_rate": 2.7847094595272525e-06, "loss": 0.76388359, "num_input_tokens_seen": 139374455, "step": 6495, "time_per_iteration": 2.6432337760925293 }, { "auxiliary_loss_clip": 0.01137243, "auxiliary_loss_mlp": 0.01045454, "balance_loss_clip": 1.05153751, "balance_loss_mlp": 1.02913451, "epoch": 0.39056064933112883, "flos": 25915761129600.0, "grad_norm": 2.402575660392066, "language_loss": 0.67757058, "learning_rate": 2.784351212350352e-06, "loss": 0.69939756, "num_input_tokens_seen": 139394770, "step": 6496, "time_per_iteration": 2.762009859085083 }, { "auxiliary_loss_clip": 0.01023856, "auxiliary_loss_mlp": 0.01010625, "balance_loss_clip": 1.02393842, "balance_loss_mlp": 1.00925446, "epoch": 0.3906207725837968, "flos": 60028421713920.0, "grad_norm": 0.6655460592599327, "language_loss": 0.53920811, "learning_rate": 2.783992935430775e-06, "loss": 0.55955297, "num_input_tokens_seen": 139454760, "step": 6497, "time_per_iteration": 3.351006507873535 }, { "auxiliary_loss_clip": 0.01094838, "auxiliary_loss_mlp": 0.00772151, "balance_loss_clip": 1.0476501, "balance_loss_mlp": 1.00038421, "epoch": 0.39068089583646476, "flos": 21068683119360.0, "grad_norm": 2.7558428999232847, "language_loss": 0.6865977, "learning_rate": 2.7836346287821068e-06, "loss": 0.70526755, "num_input_tokens_seen": 139472645, "step": 6498, "time_per_iteration": 2.7838692665100098 }, { "auxiliary_loss_clip": 0.01022021, "auxiliary_loss_mlp": 0.01009741, "balance_loss_clip": 1.02064919, "balance_loss_mlp": 1.00839996, "epoch": 0.3907410190891327, "flos": 70445677403520.0, "grad_norm": 0.7248596102007157, "language_loss": 0.51767612, "learning_rate": 2.783276292417936e-06, "loss": 0.53799379, "num_input_tokens_seen": 139536730, "step": 6499, "time_per_iteration": 3.2980377674102783 }, { "auxiliary_loss_clip": 0.01122618, "auxiliary_loss_mlp": 0.01044387, "balance_loss_clip": 1.04676056, "balance_loss_mlp": 1.02793658, "epoch": 0.3908011423418007, "flos": 27962454084480.0, "grad_norm": 1.973185164339423, "language_loss": 0.73842579, "learning_rate": 2.7829179263518487e-06, "loss": 0.76009583, "num_input_tokens_seen": 139557540, "step": 6500, "time_per_iteration": 2.7239198684692383 }, { "auxiliary_loss_clip": 0.01125366, "auxiliary_loss_mlp": 0.01037256, "balance_loss_clip": 1.05035591, "balance_loss_mlp": 1.02246249, "epoch": 0.39086126559446865, "flos": 24462097718400.0, "grad_norm": 2.6021512056662814, "language_loss": 0.68837166, "learning_rate": 2.7825595305974354e-06, "loss": 0.70999795, "num_input_tokens_seen": 139576875, "step": 6501, "time_per_iteration": 2.6926429271698 }, { "auxiliary_loss_clip": 0.01122637, "auxiliary_loss_mlp": 0.0103859, "balance_loss_clip": 1.04832482, "balance_loss_mlp": 1.02442181, "epoch": 0.3909213888471366, "flos": 16941541403520.0, "grad_norm": 2.1384909443348246, "language_loss": 0.78875881, "learning_rate": 2.782201105168287e-06, "loss": 0.8103711, "num_input_tokens_seen": 139594295, "step": 6502, "time_per_iteration": 2.647021770477295 }, { "auxiliary_loss_clip": 0.01109811, "auxiliary_loss_mlp": 0.01035328, "balance_loss_clip": 1.04876852, "balance_loss_mlp": 1.02171457, "epoch": 0.3909815120998046, "flos": 29278400751360.0, "grad_norm": 3.671996146003432, "language_loss": 0.80537987, "learning_rate": 2.7818426500779932e-06, "loss": 0.82683128, "num_input_tokens_seen": 139614080, "step": 6503, "time_per_iteration": 2.7318384647369385 }, { "auxiliary_loss_clip": 0.0110371, "auxiliary_loss_mlp": 0.01031248, "balance_loss_clip": 1.04387689, "balance_loss_mlp": 1.01760423, "epoch": 0.39104163535247255, "flos": 18951246328320.0, "grad_norm": 1.848076786389183, "language_loss": 0.71439689, "learning_rate": 2.7814841653401485e-06, "loss": 0.7357465, "num_input_tokens_seen": 139632755, "step": 6504, "time_per_iteration": 2.6983554363250732 }, { "auxiliary_loss_clip": 0.01130195, "auxiliary_loss_mlp": 0.01034576, "balance_loss_clip": 1.0459981, "balance_loss_mlp": 1.0199374, "epoch": 0.3911017586051405, "flos": 26323347732480.0, "grad_norm": 1.4848516480735832, "language_loss": 0.83245611, "learning_rate": 2.7811256509683454e-06, "loss": 0.8541038, "num_input_tokens_seen": 139654205, "step": 6505, "time_per_iteration": 2.6663267612457275 }, { "auxiliary_loss_clip": 0.01131259, "auxiliary_loss_mlp": 0.01036964, "balance_loss_clip": 1.04880178, "balance_loss_mlp": 1.02123427, "epoch": 0.3911618818578085, "flos": 21835770992640.0, "grad_norm": 1.9330872564568533, "language_loss": 0.71352887, "learning_rate": 2.7807671069761797e-06, "loss": 0.73521107, "num_input_tokens_seen": 139673595, "step": 6506, "time_per_iteration": 2.6168534755706787 }, { "auxiliary_loss_clip": 0.01105925, "auxiliary_loss_mlp": 0.01036209, "balance_loss_clip": 1.04536867, "balance_loss_mlp": 1.02267289, "epoch": 0.3912220051104765, "flos": 16359680989440.0, "grad_norm": 2.106647299507305, "language_loss": 0.75086504, "learning_rate": 2.7804085333772477e-06, "loss": 0.77228636, "num_input_tokens_seen": 139690565, "step": 6507, "time_per_iteration": 2.8207101821899414 }, { "auxiliary_loss_clip": 0.01053146, "auxiliary_loss_mlp": 0.01002126, "balance_loss_clip": 1.02403712, "balance_loss_mlp": 1.00068331, "epoch": 0.39128212836314447, "flos": 71050986420480.0, "grad_norm": 0.9386901837185221, "language_loss": 0.56488812, "learning_rate": 2.7800499301851446e-06, "loss": 0.58544087, "num_input_tokens_seen": 139749420, "step": 6508, "time_per_iteration": 3.3985793590545654 }, { "auxiliary_loss_clip": 0.01121659, "auxiliary_loss_mlp": 0.01038464, "balance_loss_clip": 1.05045915, "balance_loss_mlp": 1.02476096, "epoch": 0.39134225161581243, "flos": 20331975173760.0, "grad_norm": 2.0207920703954936, "language_loss": 0.76855135, "learning_rate": 2.779691297413471e-06, "loss": 0.79015261, "num_input_tokens_seen": 139766265, "step": 6509, "time_per_iteration": 2.6667048931121826 }, { "auxiliary_loss_clip": 0.01101334, "auxiliary_loss_mlp": 0.01043985, "balance_loss_clip": 1.04298568, "balance_loss_mlp": 1.02731967, "epoch": 0.3914023748684804, "flos": 17018390551680.0, "grad_norm": 5.905968065437354, "language_loss": 0.82739937, "learning_rate": 2.779332635075825e-06, "loss": 0.84885252, "num_input_tokens_seen": 139782400, "step": 6510, "time_per_iteration": 2.933931589126587 }, { "auxiliary_loss_clip": 0.0112259, "auxiliary_loss_mlp": 0.01038677, "balance_loss_clip": 1.04712081, "balance_loss_mlp": 1.02406788, "epoch": 0.39146249812114836, "flos": 18405224709120.0, "grad_norm": 5.781106582003857, "language_loss": 0.76999253, "learning_rate": 2.7789739431858073e-06, "loss": 0.79160517, "num_input_tokens_seen": 139801435, "step": 6511, "time_per_iteration": 2.6926233768463135 }, { "auxiliary_loss_clip": 0.01035867, "auxiliary_loss_mlp": 0.01006458, "balance_loss_clip": 1.02583003, "balance_loss_mlp": 1.00515223, "epoch": 0.3915226213738163, "flos": 67637355442560.0, "grad_norm": 0.716551875912138, "language_loss": 0.57749176, "learning_rate": 2.7786152217570196e-06, "loss": 0.59791505, "num_input_tokens_seen": 139869700, "step": 6512, "time_per_iteration": 3.3695731163024902 }, { "auxiliary_loss_clip": 0.01135844, "auxiliary_loss_mlp": 0.01035397, "balance_loss_clip": 1.05013657, "balance_loss_mlp": 1.02001858, "epoch": 0.3915827446264843, "flos": 26359330181760.0, "grad_norm": 1.8014676974175234, "language_loss": 0.69625974, "learning_rate": 2.7782564708030647e-06, "loss": 0.71797216, "num_input_tokens_seen": 139890140, "step": 6513, "time_per_iteration": 2.8037526607513428 }, { "auxiliary_loss_clip": 0.01095461, "auxiliary_loss_mlp": 0.01038913, "balance_loss_clip": 1.04791474, "balance_loss_mlp": 1.02376771, "epoch": 0.39164286787915226, "flos": 21943897908480.0, "grad_norm": 8.577901504868834, "language_loss": 0.75566119, "learning_rate": 2.7778976903375464e-06, "loss": 0.77700496, "num_input_tokens_seen": 139908020, "step": 6514, "time_per_iteration": 2.8419485092163086 }, { "auxiliary_loss_clip": 0.01094835, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.04639578, "balance_loss_mlp": 1.02636766, "epoch": 0.3917029911318202, "flos": 16399829416320.0, "grad_norm": 2.188170768945522, "language_loss": 0.77334291, "learning_rate": 2.7775388803740693e-06, "loss": 0.79469454, "num_input_tokens_seen": 139926180, "step": 6515, "time_per_iteration": 2.7894155979156494 }, { "auxiliary_loss_clip": 0.01087017, "auxiliary_loss_mlp": 0.0105158, "balance_loss_clip": 1.03979194, "balance_loss_mlp": 1.03763223, "epoch": 0.3917631143844882, "flos": 26211701283840.0, "grad_norm": 1.5088395946363757, "language_loss": 0.79678488, "learning_rate": 2.7771800409262406e-06, "loss": 0.81817091, "num_input_tokens_seen": 139947420, "step": 6516, "time_per_iteration": 2.902660608291626 }, { "auxiliary_loss_clip": 0.01092649, "auxiliary_loss_mlp": 0.01042434, "balance_loss_clip": 1.04691982, "balance_loss_mlp": 1.02799749, "epoch": 0.39182323763715615, "flos": 18548364407040.0, "grad_norm": 1.9539461980584907, "language_loss": 0.70539331, "learning_rate": 2.7768211720076665e-06, "loss": 0.72674412, "num_input_tokens_seen": 139965800, "step": 6517, "time_per_iteration": 4.275412082672119 }, { "auxiliary_loss_clip": 0.0108795, "auxiliary_loss_mlp": 0.01045392, "balance_loss_clip": 1.04107618, "balance_loss_mlp": 1.03034759, "epoch": 0.3918833608898241, "flos": 34313543395200.0, "grad_norm": 1.7270068216094292, "language_loss": 0.72215492, "learning_rate": 2.776462273631956e-06, "loss": 0.74348831, "num_input_tokens_seen": 139988140, "step": 6518, "time_per_iteration": 4.390907287597656 }, { "auxiliary_loss_clip": 0.01124647, "auxiliary_loss_mlp": 0.0104138, "balance_loss_clip": 1.05179489, "balance_loss_mlp": 1.02679503, "epoch": 0.3919434841424921, "flos": 36939582812160.0, "grad_norm": 1.8265438315676477, "language_loss": 0.61835045, "learning_rate": 2.7761033458127177e-06, "loss": 0.64001071, "num_input_tokens_seen": 140010060, "step": 6519, "time_per_iteration": 4.281017780303955 }, { "auxiliary_loss_clip": 0.01142133, "auxiliary_loss_mlp": 0.01043415, "balance_loss_clip": 1.05199361, "balance_loss_mlp": 1.02807307, "epoch": 0.3920036073951601, "flos": 23508956373120.0, "grad_norm": 2.723028929016538, "language_loss": 0.67084813, "learning_rate": 2.775744388563563e-06, "loss": 0.6927036, "num_input_tokens_seen": 140029400, "step": 6520, "time_per_iteration": 2.6971800327301025 }, { "auxiliary_loss_clip": 0.01130641, "auxiliary_loss_mlp": 0.01040483, "balance_loss_clip": 1.04749501, "balance_loss_mlp": 1.02648759, "epoch": 0.39206373064782807, "flos": 18406086635520.0, "grad_norm": 1.8214273138880266, "language_loss": 0.78716481, "learning_rate": 2.775385401898104e-06, "loss": 0.80887604, "num_input_tokens_seen": 140048940, "step": 6521, "time_per_iteration": 2.69966459274292 }, { "auxiliary_loss_clip": 0.01128458, "auxiliary_loss_mlp": 0.01040156, "balance_loss_clip": 1.05050826, "balance_loss_mlp": 1.02289462, "epoch": 0.39212385390049603, "flos": 12313051608960.0, "grad_norm": 2.9673341059873897, "language_loss": 0.70119011, "learning_rate": 2.775026385829952e-06, "loss": 0.72287625, "num_input_tokens_seen": 140066380, "step": 6522, "time_per_iteration": 2.7100417613983154 }, { "auxiliary_loss_clip": 0.0110971, "auxiliary_loss_mlp": 0.01035612, "balance_loss_clip": 1.0467701, "balance_loss_mlp": 1.02100325, "epoch": 0.392183977153164, "flos": 19719160214400.0, "grad_norm": 2.0481488550445595, "language_loss": 0.76847959, "learning_rate": 2.774667340372722e-06, "loss": 0.78993279, "num_input_tokens_seen": 140085275, "step": 6523, "time_per_iteration": 4.336375713348389 }, { "auxiliary_loss_clip": 0.01111577, "auxiliary_loss_mlp": 0.01040964, "balance_loss_clip": 1.04617906, "balance_loss_mlp": 1.02597904, "epoch": 0.39224410040583196, "flos": 33144902403840.0, "grad_norm": 2.4064695780458254, "language_loss": 0.62052447, "learning_rate": 2.7743082655400293e-06, "loss": 0.64204991, "num_input_tokens_seen": 140105105, "step": 6524, "time_per_iteration": 2.861999750137329 }, { "auxiliary_loss_clip": 0.0113421, "auxiliary_loss_mlp": 0.01041444, "balance_loss_clip": 1.04792655, "balance_loss_mlp": 1.02591681, "epoch": 0.39230422365849993, "flos": 27782434097280.0, "grad_norm": 3.311294983146634, "language_loss": 0.74027938, "learning_rate": 2.773949161345489e-06, "loss": 0.76203597, "num_input_tokens_seen": 140125645, "step": 6525, "time_per_iteration": 2.6660265922546387 }, { "auxiliary_loss_clip": 0.01111123, "auxiliary_loss_mlp": 0.01038937, "balance_loss_clip": 1.04621911, "balance_loss_mlp": 1.02488267, "epoch": 0.3923643469111679, "flos": 17931634865280.0, "grad_norm": 1.9378599466790423, "language_loss": 0.81101322, "learning_rate": 2.773590027802719e-06, "loss": 0.83251387, "num_input_tokens_seen": 140141925, "step": 6526, "time_per_iteration": 2.6949198246002197 }, { "auxiliary_loss_clip": 0.01122115, "auxiliary_loss_mlp": 0.01043128, "balance_loss_clip": 1.04750228, "balance_loss_mlp": 1.02844119, "epoch": 0.39242447016383586, "flos": 24059539019520.0, "grad_norm": 2.21390394508072, "language_loss": 0.69860446, "learning_rate": 2.7732308649253383e-06, "loss": 0.72025692, "num_input_tokens_seen": 140160965, "step": 6527, "time_per_iteration": 2.648738384246826 }, { "auxiliary_loss_clip": 0.01093845, "auxiliary_loss_mlp": 0.01034532, "balance_loss_clip": 1.04563034, "balance_loss_mlp": 1.01990485, "epoch": 0.3924845934165038, "flos": 10664069016960.0, "grad_norm": 2.870547931880311, "language_loss": 0.82659566, "learning_rate": 2.772871672726965e-06, "loss": 0.84787941, "num_input_tokens_seen": 140177780, "step": 6528, "time_per_iteration": 2.7436537742614746 }, { "auxiliary_loss_clip": 0.01105744, "auxiliary_loss_mlp": 0.01032675, "balance_loss_clip": 1.04709864, "balance_loss_mlp": 1.01909113, "epoch": 0.3925447166691718, "flos": 31245910174080.0, "grad_norm": 1.7012894335018593, "language_loss": 0.68846285, "learning_rate": 2.7725124512212205e-06, "loss": 0.70984709, "num_input_tokens_seen": 140201660, "step": 6529, "time_per_iteration": 2.7932794094085693 }, { "auxiliary_loss_clip": 0.01112194, "auxiliary_loss_mlp": 0.01035865, "balance_loss_clip": 1.04500198, "balance_loss_mlp": 1.02043366, "epoch": 0.39260483992183975, "flos": 29415040087680.0, "grad_norm": 2.4176127237752145, "language_loss": 0.80461496, "learning_rate": 2.7721532004217267e-06, "loss": 0.82609558, "num_input_tokens_seen": 140218585, "step": 6530, "time_per_iteration": 2.7094242572784424 }, { "auxiliary_loss_clip": 0.01119536, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.04586959, "balance_loss_mlp": 1.0264107, "epoch": 0.3926649631745077, "flos": 22857788666880.0, "grad_norm": 1.6828565274400475, "language_loss": 0.75680822, "learning_rate": 2.7717939203421063e-06, "loss": 0.77841288, "num_input_tokens_seen": 140239905, "step": 6531, "time_per_iteration": 2.7238411903381348 }, { "auxiliary_loss_clip": 0.01058847, "auxiliary_loss_mlp": 0.01008064, "balance_loss_clip": 1.03009987, "balance_loss_mlp": 1.00663972, "epoch": 0.3927250864271757, "flos": 63893881872000.0, "grad_norm": 0.8211432271778524, "language_loss": 0.60317427, "learning_rate": 2.7714346109959822e-06, "loss": 0.62384337, "num_input_tokens_seen": 140293820, "step": 6532, "time_per_iteration": 3.047954797744751 }, { "auxiliary_loss_clip": 0.01037233, "auxiliary_loss_mlp": 0.01004719, "balance_loss_clip": 1.02873898, "balance_loss_mlp": 1.00334251, "epoch": 0.3927852096798437, "flos": 68909741890560.0, "grad_norm": 0.7803139799058858, "language_loss": 0.55459583, "learning_rate": 2.771075272396981e-06, "loss": 0.57501537, "num_input_tokens_seen": 140360420, "step": 6533, "time_per_iteration": 3.306561231613159 }, { "auxiliary_loss_clip": 0.01112553, "auxiliary_loss_mlp": 0.01040733, "balance_loss_clip": 1.04983759, "balance_loss_mlp": 1.02614141, "epoch": 0.39284533293251167, "flos": 29715972232320.0, "grad_norm": 2.2467248181922232, "language_loss": 0.75955313, "learning_rate": 2.7707159045587284e-06, "loss": 0.78108597, "num_input_tokens_seen": 140381950, "step": 6534, "time_per_iteration": 2.7788329124450684 }, { "auxiliary_loss_clip": 0.0112134, "auxiliary_loss_mlp": 0.01045716, "balance_loss_clip": 1.04698312, "balance_loss_mlp": 1.02866912, "epoch": 0.39290545618517964, "flos": 18552027594240.0, "grad_norm": 2.2080736338994944, "language_loss": 0.78123498, "learning_rate": 2.770356507494851e-06, "loss": 0.80290556, "num_input_tokens_seen": 140399410, "step": 6535, "time_per_iteration": 2.6949005126953125 }, { "auxiliary_loss_clip": 0.0109337, "auxiliary_loss_mlp": 0.0103265, "balance_loss_clip": 1.04779291, "balance_loss_mlp": 1.01950169, "epoch": 0.3929655794378476, "flos": 26249479413120.0, "grad_norm": 1.9769476518607105, "language_loss": 0.686719, "learning_rate": 2.769997081218978e-06, "loss": 0.7079792, "num_input_tokens_seen": 140419055, "step": 6536, "time_per_iteration": 2.7684245109558105 }, { "auxiliary_loss_clip": 0.01104946, "auxiliary_loss_mlp": 0.01037851, "balance_loss_clip": 1.04767156, "balance_loss_mlp": 1.02469027, "epoch": 0.39302570269051557, "flos": 29277933874560.0, "grad_norm": 1.8012856746153256, "language_loss": 0.69048655, "learning_rate": 2.769637625744738e-06, "loss": 0.71191454, "num_input_tokens_seen": 140438800, "step": 6537, "time_per_iteration": 2.7638440132141113 }, { "auxiliary_loss_clip": 0.01122897, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.05155134, "balance_loss_mlp": 1.02624357, "epoch": 0.39308582594318353, "flos": 17347440067200.0, "grad_norm": 1.7514361880438423, "language_loss": 0.78990901, "learning_rate": 2.769278141085763e-06, "loss": 0.81154549, "num_input_tokens_seen": 140456880, "step": 6538, "time_per_iteration": 2.635075807571411 }, { "auxiliary_loss_clip": 0.01003397, "auxiliary_loss_mlp": 0.01017351, "balance_loss_clip": 1.02259159, "balance_loss_mlp": 1.01596797, "epoch": 0.3931459491958515, "flos": 61007094650880.0, "grad_norm": 0.8098068956453415, "language_loss": 0.6190061, "learning_rate": 2.768918627255683e-06, "loss": 0.63921356, "num_input_tokens_seen": 140507510, "step": 6539, "time_per_iteration": 3.0673203468322754 }, { "auxiliary_loss_clip": 0.01104217, "auxiliary_loss_mlp": 0.0103537, "balance_loss_clip": 1.04730296, "balance_loss_mlp": 1.0206002, "epoch": 0.39320607244851946, "flos": 39016009249920.0, "grad_norm": 3.0347619755245248, "language_loss": 0.68405032, "learning_rate": 2.7685590842681315e-06, "loss": 0.70544618, "num_input_tokens_seen": 140528740, "step": 6540, "time_per_iteration": 2.7993643283843994 }, { "auxiliary_loss_clip": 0.01105128, "auxiliary_loss_mlp": 0.01030736, "balance_loss_clip": 1.04439306, "balance_loss_mlp": 1.01638293, "epoch": 0.3932661957011874, "flos": 24679752180480.0, "grad_norm": 1.8325322608278536, "language_loss": 0.7276125, "learning_rate": 2.7681995121367433e-06, "loss": 0.74897116, "num_input_tokens_seen": 140547560, "step": 6541, "time_per_iteration": 2.659224510192871 }, { "auxiliary_loss_clip": 0.01054751, "auxiliary_loss_mlp": 0.01009472, "balance_loss_clip": 1.02648139, "balance_loss_mlp": 1.0080775, "epoch": 0.3933263189538554, "flos": 70096552185600.0, "grad_norm": 0.8313029932067456, "language_loss": 0.60319722, "learning_rate": 2.7678399108751516e-06, "loss": 0.6238395, "num_input_tokens_seen": 140601175, "step": 6542, "time_per_iteration": 2.968062400817871 }, { "auxiliary_loss_clip": 0.01121623, "auxiliary_loss_mlp": 0.01038302, "balance_loss_clip": 1.04764903, "balance_loss_mlp": 1.0243547, "epoch": 0.39338644220652336, "flos": 22929071207040.0, "grad_norm": 1.6209695943494522, "language_loss": 0.82034504, "learning_rate": 2.7674802804969947e-06, "loss": 0.84194422, "num_input_tokens_seen": 140622200, "step": 6543, "time_per_iteration": 2.638796806335449 }, { "auxiliary_loss_clip": 0.01103923, "auxiliary_loss_mlp": 0.01034902, "balance_loss_clip": 1.04355097, "balance_loss_mlp": 1.02045417, "epoch": 0.3934465654591913, "flos": 30848163897600.0, "grad_norm": 3.743075543188527, "language_loss": 0.69100285, "learning_rate": 2.767120621015908e-06, "loss": 0.71239114, "num_input_tokens_seen": 140643125, "step": 6544, "time_per_iteration": 2.7180936336517334 }, { "auxiliary_loss_clip": 0.01112442, "auxiliary_loss_mlp": 0.01047198, "balance_loss_clip": 1.04659534, "balance_loss_mlp": 1.0316174, "epoch": 0.3935066887118593, "flos": 29236528471680.0, "grad_norm": 2.0996268311737976, "language_loss": 0.76072371, "learning_rate": 2.76676093244553e-06, "loss": 0.78232014, "num_input_tokens_seen": 140662500, "step": 6545, "time_per_iteration": 2.7429869174957275 }, { "auxiliary_loss_clip": 0.01091051, "auxiliary_loss_mlp": 0.01033724, "balance_loss_clip": 1.04633403, "balance_loss_mlp": 1.02104044, "epoch": 0.3935668119645273, "flos": 19135288638720.0, "grad_norm": 1.7673371756448844, "language_loss": 0.74672133, "learning_rate": 2.7664012147995015e-06, "loss": 0.76796907, "num_input_tokens_seen": 140681960, "step": 6546, "time_per_iteration": 2.6785295009613037 }, { "auxiliary_loss_clip": 0.01109428, "auxiliary_loss_mlp": 0.0103425, "balance_loss_clip": 1.04903293, "balance_loss_mlp": 1.01946843, "epoch": 0.3936269352171953, "flos": 18516116972160.0, "grad_norm": 1.9230817449169166, "language_loss": 0.81627518, "learning_rate": 2.7660414680914617e-06, "loss": 0.83771199, "num_input_tokens_seen": 140699170, "step": 6547, "time_per_iteration": 2.638214588165283 }, { "auxiliary_loss_clip": 0.01114598, "auxiliary_loss_mlp": 0.00772919, "balance_loss_clip": 1.04404151, "balance_loss_mlp": 1.00032711, "epoch": 0.39368705846986324, "flos": 15632813370240.0, "grad_norm": 1.9821442562566327, "language_loss": 0.84406352, "learning_rate": 2.7656816923350525e-06, "loss": 0.86293864, "num_input_tokens_seen": 140714920, "step": 6548, "time_per_iteration": 2.6490747928619385 }, { "auxiliary_loss_clip": 0.01118074, "auxiliary_loss_mlp": 0.00771091, "balance_loss_clip": 1.04686236, "balance_loss_mlp": 1.00034189, "epoch": 0.3937471817225312, "flos": 21325839563520.0, "grad_norm": 1.7617733187332765, "language_loss": 0.7311933, "learning_rate": 2.7653218875439174e-06, "loss": 0.75008494, "num_input_tokens_seen": 140734595, "step": 6549, "time_per_iteration": 2.635380983352661 }, { "auxiliary_loss_clip": 0.01071621, "auxiliary_loss_mlp": 0.01042928, "balance_loss_clip": 1.0444963, "balance_loss_mlp": 1.0259527, "epoch": 0.39380730497519917, "flos": 20776693461120.0, "grad_norm": 2.774519883144605, "language_loss": 0.77592897, "learning_rate": 2.764962053731699e-06, "loss": 0.7970745, "num_input_tokens_seen": 140754050, "step": 6550, "time_per_iteration": 2.733921527862549 }, { "auxiliary_loss_clip": 0.01095205, "auxiliary_loss_mlp": 0.01030728, "balance_loss_clip": 1.04455531, "balance_loss_mlp": 1.01674485, "epoch": 0.39386742822786713, "flos": 21609784575360.0, "grad_norm": 3.1837220930493517, "language_loss": 0.81144142, "learning_rate": 2.7646021909120434e-06, "loss": 0.83270073, "num_input_tokens_seen": 140771440, "step": 6551, "time_per_iteration": 2.851475238800049 }, { "auxiliary_loss_clip": 0.01117625, "auxiliary_loss_mlp": 0.01036299, "balance_loss_clip": 1.0443331, "balance_loss_mlp": 1.02188659, "epoch": 0.3939275514805351, "flos": 12414642249600.0, "grad_norm": 12.177431380433415, "language_loss": 0.80449802, "learning_rate": 2.764242299098596e-06, "loss": 0.82603723, "num_input_tokens_seen": 140786715, "step": 6552, "time_per_iteration": 2.667344570159912 }, { "auxiliary_loss_clip": 0.01133223, "auxiliary_loss_mlp": 0.01043273, "balance_loss_clip": 1.04791522, "balance_loss_mlp": 1.02883697, "epoch": 0.39398767473320306, "flos": 18552027594240.0, "grad_norm": 2.002210962432939, "language_loss": 0.71199149, "learning_rate": 2.763882378305003e-06, "loss": 0.73375642, "num_input_tokens_seen": 140804950, "step": 6553, "time_per_iteration": 2.6329705715179443 }, { "auxiliary_loss_clip": 0.0111827, "auxiliary_loss_mlp": 0.0077145, "balance_loss_clip": 1.04818738, "balance_loss_mlp": 1.00036502, "epoch": 0.39404779798587103, "flos": 29308888419840.0, "grad_norm": 4.200797737547303, "language_loss": 0.64058566, "learning_rate": 2.7635224285449144e-06, "loss": 0.65948284, "num_input_tokens_seen": 140822800, "step": 6554, "time_per_iteration": 2.7190303802490234 }, { "auxiliary_loss_clip": 0.01109713, "auxiliary_loss_mlp": 0.01041117, "balance_loss_clip": 1.04655266, "balance_loss_mlp": 1.02747416, "epoch": 0.394107921238539, "flos": 34897055834880.0, "grad_norm": 2.186636266316066, "language_loss": 0.78957009, "learning_rate": 2.7631624498319796e-06, "loss": 0.81107843, "num_input_tokens_seen": 140842940, "step": 6555, "time_per_iteration": 2.7675819396972656 }, { "auxiliary_loss_clip": 0.01102424, "auxiliary_loss_mlp": 0.0104302, "balance_loss_clip": 1.04469514, "balance_loss_mlp": 1.02758873, "epoch": 0.39416804449120696, "flos": 25081413039360.0, "grad_norm": 1.7945119387028163, "language_loss": 0.71689165, "learning_rate": 2.7628024421798473e-06, "loss": 0.7383461, "num_input_tokens_seen": 140863060, "step": 6556, "time_per_iteration": 4.261122703552246 }, { "auxiliary_loss_clip": 0.01129248, "auxiliary_loss_mlp": 0.01031706, "balance_loss_clip": 1.0445503, "balance_loss_mlp": 1.01749015, "epoch": 0.3942281677438749, "flos": 32306639731200.0, "grad_norm": 1.7970895618407805, "language_loss": 0.84080362, "learning_rate": 2.7624424056021705e-06, "loss": 0.86241317, "num_input_tokens_seen": 140883795, "step": 6557, "time_per_iteration": 2.7031610012054443 }, { "auxiliary_loss_clip": 0.01116561, "auxiliary_loss_mlp": 0.01032116, "balance_loss_clip": 1.04790783, "balance_loss_mlp": 1.01810956, "epoch": 0.3942882909965429, "flos": 24936621315840.0, "grad_norm": 3.8501140650976238, "language_loss": 0.806759, "learning_rate": 2.7620823401126004e-06, "loss": 0.82824582, "num_input_tokens_seen": 140903055, "step": 6558, "time_per_iteration": 5.6523637771606445 }, { "auxiliary_loss_clip": 0.01130051, "auxiliary_loss_mlp": 0.01035884, "balance_loss_clip": 1.04807055, "balance_loss_mlp": 1.02238965, "epoch": 0.39434841424921085, "flos": 11874797769600.0, "grad_norm": 1.8974962376031472, "language_loss": 0.70930403, "learning_rate": 2.761722245724792e-06, "loss": 0.73096335, "num_input_tokens_seen": 140920685, "step": 6559, "time_per_iteration": 2.6645302772521973 }, { "auxiliary_loss_clip": 0.01113668, "auxiliary_loss_mlp": 0.0104073, "balance_loss_clip": 1.04660964, "balance_loss_mlp": 1.02452326, "epoch": 0.3944085375018789, "flos": 16361620323840.0, "grad_norm": 2.3002241644217865, "language_loss": 0.80355662, "learning_rate": 2.7613621224524003e-06, "loss": 0.82510054, "num_input_tokens_seen": 140937320, "step": 6560, "time_per_iteration": 2.8372745513916016 }, { "auxiliary_loss_clip": 0.01109469, "auxiliary_loss_mlp": 0.0103941, "balance_loss_clip": 1.04681468, "balance_loss_mlp": 1.02334619, "epoch": 0.39446866075454684, "flos": 10633365866880.0, "grad_norm": 2.2192317359233034, "language_loss": 0.828062, "learning_rate": 2.7610019703090803e-06, "loss": 0.84955078, "num_input_tokens_seen": 140954855, "step": 6561, "time_per_iteration": 2.6724014282226562 }, { "auxiliary_loss_clip": 0.01119263, "auxiliary_loss_mlp": 0.01043889, "balance_loss_clip": 1.04620779, "balance_loss_mlp": 1.02972126, "epoch": 0.3945287840072148, "flos": 18187498419840.0, "grad_norm": 2.478683034492453, "language_loss": 0.80985552, "learning_rate": 2.7606417893084887e-06, "loss": 0.83148706, "num_input_tokens_seen": 140973250, "step": 6562, "time_per_iteration": 4.211291074752808 }, { "auxiliary_loss_clip": 0.01100981, "auxiliary_loss_mlp": 0.01040375, "balance_loss_clip": 1.04367661, "balance_loss_mlp": 1.02568245, "epoch": 0.39458890725988277, "flos": 23039891642880.0, "grad_norm": 1.8396668644534004, "language_loss": 0.81574059, "learning_rate": 2.7602815794642853e-06, "loss": 0.83715415, "num_input_tokens_seen": 140993050, "step": 6563, "time_per_iteration": 2.6933205127716064 }, { "auxiliary_loss_clip": 0.01078578, "auxiliary_loss_mlp": 0.01052866, "balance_loss_clip": 1.03979552, "balance_loss_mlp": 1.03385234, "epoch": 0.39464903051255074, "flos": 17159052211200.0, "grad_norm": 2.4284687703059276, "language_loss": 0.69678622, "learning_rate": 2.759921340790127e-06, "loss": 0.71810067, "num_input_tokens_seen": 141010815, "step": 6564, "time_per_iteration": 2.7754619121551514 }, { "auxiliary_loss_clip": 0.01119553, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.04547322, "balance_loss_mlp": 1.02260029, "epoch": 0.3947091537652187, "flos": 15889000147200.0, "grad_norm": 2.342409184231709, "language_loss": 0.82842124, "learning_rate": 2.759561073299676e-06, "loss": 0.84999526, "num_input_tokens_seen": 141028720, "step": 6565, "time_per_iteration": 2.652029037475586 }, { "auxiliary_loss_clip": 0.01091527, "auxiliary_loss_mlp": 0.01044097, "balance_loss_clip": 1.04201448, "balance_loss_mlp": 1.02794445, "epoch": 0.39476927701788667, "flos": 18545491319040.0, "grad_norm": 1.8313066364371182, "language_loss": 0.83458865, "learning_rate": 2.7592007770065937e-06, "loss": 0.85594487, "num_input_tokens_seen": 141046025, "step": 6566, "time_per_iteration": 2.6853299140930176 }, { "auxiliary_loss_clip": 0.01137834, "auxiliary_loss_mlp": 0.01036947, "balance_loss_clip": 1.04882693, "balance_loss_mlp": 1.02146816, "epoch": 0.39482940027055463, "flos": 22275712771200.0, "grad_norm": 2.7953182854439973, "language_loss": 0.77462149, "learning_rate": 2.7588404519245403e-06, "loss": 0.79636931, "num_input_tokens_seen": 141066865, "step": 6567, "time_per_iteration": 2.6695878505706787 }, { "auxiliary_loss_clip": 0.01114738, "auxiliary_loss_mlp": 0.01037774, "balance_loss_clip": 1.04457474, "balance_loss_mlp": 1.0235877, "epoch": 0.3948895235232226, "flos": 14757634494720.0, "grad_norm": 3.2000391748281065, "language_loss": 0.80752146, "learning_rate": 2.758480098067182e-06, "loss": 0.82904655, "num_input_tokens_seen": 141084210, "step": 6568, "time_per_iteration": 2.6126980781555176 }, { "auxiliary_loss_clip": 0.01100656, "auxiliary_loss_mlp": 0.01035941, "balance_loss_clip": 1.04693437, "balance_loss_mlp": 1.02142143, "epoch": 0.39494964677589056, "flos": 22565763095040.0, "grad_norm": 3.155903507973262, "language_loss": 0.846977, "learning_rate": 2.7581197154481816e-06, "loss": 0.868343, "num_input_tokens_seen": 141103895, "step": 6569, "time_per_iteration": 2.731241464614868 }, { "auxiliary_loss_clip": 0.01076285, "auxiliary_loss_mlp": 0.01045444, "balance_loss_clip": 1.046417, "balance_loss_mlp": 1.03076911, "epoch": 0.3950097700285585, "flos": 22963186149120.0, "grad_norm": 2.966787083651573, "language_loss": 0.74931526, "learning_rate": 2.7577593040812066e-06, "loss": 0.77053261, "num_input_tokens_seen": 141124000, "step": 6570, "time_per_iteration": 2.816168785095215 }, { "auxiliary_loss_clip": 0.01093382, "auxiliary_loss_mlp": 0.01037489, "balance_loss_clip": 1.04271865, "balance_loss_mlp": 1.02224803, "epoch": 0.3950698932812265, "flos": 20595236929920.0, "grad_norm": 3.807643490882315, "language_loss": 0.80009687, "learning_rate": 2.757398863979922e-06, "loss": 0.82140559, "num_input_tokens_seen": 141142535, "step": 6571, "time_per_iteration": 2.7444143295288086 }, { "auxiliary_loss_clip": 0.0110309, "auxiliary_loss_mlp": 0.01042438, "balance_loss_clip": 1.046592, "balance_loss_mlp": 1.02792382, "epoch": 0.39513001653389446, "flos": 20375786787840.0, "grad_norm": 2.0513494110156105, "language_loss": 0.77667749, "learning_rate": 2.757038395157997e-06, "loss": 0.79813272, "num_input_tokens_seen": 141161575, "step": 6572, "time_per_iteration": 2.787951946258545 }, { "auxiliary_loss_clip": 0.01096298, "auxiliary_loss_mlp": 0.01039178, "balance_loss_clip": 1.04524946, "balance_loss_mlp": 1.02422285, "epoch": 0.3951901397865625, "flos": 26463650256000.0, "grad_norm": 2.2233910711840092, "language_loss": 0.74710405, "learning_rate": 2.7566778976291002e-06, "loss": 0.76845872, "num_input_tokens_seen": 141181150, "step": 6573, "time_per_iteration": 2.8065271377563477 }, { "auxiliary_loss_clip": 0.01119667, "auxiliary_loss_mlp": 0.01033875, "balance_loss_clip": 1.04583275, "balance_loss_mlp": 1.02073228, "epoch": 0.39525026303923044, "flos": 43838345767680.0, "grad_norm": 1.5702623893020402, "language_loss": 0.681665, "learning_rate": 2.7563173714069017e-06, "loss": 0.7032004, "num_input_tokens_seen": 141206310, "step": 6574, "time_per_iteration": 2.917938470840454 }, { "auxiliary_loss_clip": 0.01066027, "auxiliary_loss_mlp": 0.01046829, "balance_loss_clip": 1.03601551, "balance_loss_mlp": 1.02941298, "epoch": 0.3953103862918984, "flos": 18040803275520.0, "grad_norm": 11.51359836007049, "language_loss": 0.71934754, "learning_rate": 2.755956816505072e-06, "loss": 0.74047613, "num_input_tokens_seen": 141223925, "step": 6575, "time_per_iteration": 2.8125574588775635 }, { "auxiliary_loss_clip": 0.01106625, "auxiliary_loss_mlp": 0.01044084, "balance_loss_clip": 1.04328454, "balance_loss_mlp": 1.02871156, "epoch": 0.3953705095445664, "flos": 16976015481600.0, "grad_norm": 2.3082458130711276, "language_loss": 0.73497486, "learning_rate": 2.7555962329372845e-06, "loss": 0.75648189, "num_input_tokens_seen": 141239010, "step": 6576, "time_per_iteration": 2.7072994709014893 }, { "auxiliary_loss_clip": 0.01131853, "auxiliary_loss_mlp": 0.01038072, "balance_loss_clip": 1.04721868, "balance_loss_mlp": 1.02482772, "epoch": 0.39543063279723434, "flos": 17411144837760.0, "grad_norm": 2.584581612312142, "language_loss": 0.83806884, "learning_rate": 2.7552356207172124e-06, "loss": 0.85976809, "num_input_tokens_seen": 141252255, "step": 6577, "time_per_iteration": 2.673980236053467 }, { "auxiliary_loss_clip": 0.01108115, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.04473734, "balance_loss_mlp": 1.02394366, "epoch": 0.3954907560499023, "flos": 22784207656320.0, "grad_norm": 3.282604232183532, "language_loss": 0.90597945, "learning_rate": 2.75487497985853e-06, "loss": 0.92744309, "num_input_tokens_seen": 141269325, "step": 6578, "time_per_iteration": 2.8357715606689453 }, { "auxiliary_loss_clip": 0.01113431, "auxiliary_loss_mlp": 0.01038042, "balance_loss_clip": 1.04971015, "balance_loss_mlp": 1.0215559, "epoch": 0.39555087930257027, "flos": 21944400698880.0, "grad_norm": 1.9328811386040925, "language_loss": 0.77836883, "learning_rate": 2.7545143103749117e-06, "loss": 0.7998836, "num_input_tokens_seen": 141288505, "step": 6579, "time_per_iteration": 2.78900146484375 }, { "auxiliary_loss_clip": 0.01080071, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.04296517, "balance_loss_mlp": 1.02181292, "epoch": 0.39561100255523823, "flos": 20404622430720.0, "grad_norm": 2.0515288557813705, "language_loss": 0.68375254, "learning_rate": 2.754153612280037e-06, "loss": 0.70492923, "num_input_tokens_seen": 141303680, "step": 6580, "time_per_iteration": 2.796602249145508 }, { "auxiliary_loss_clip": 0.01119101, "auxiliary_loss_mlp": 0.01031775, "balance_loss_clip": 1.04687381, "balance_loss_mlp": 1.01770234, "epoch": 0.3956711258079062, "flos": 27964572986880.0, "grad_norm": 5.6192422063497425, "language_loss": 0.58592093, "learning_rate": 2.7537928855875797e-06, "loss": 0.60742974, "num_input_tokens_seen": 141324090, "step": 6581, "time_per_iteration": 2.738732099533081 }, { "auxiliary_loss_clip": 0.0110807, "auxiliary_loss_mlp": 0.01047889, "balance_loss_clip": 1.04554892, "balance_loss_mlp": 1.03111625, "epoch": 0.39573124906057416, "flos": 14428297670400.0, "grad_norm": 1.840388254222325, "language_loss": 0.69687581, "learning_rate": 2.7534321303112224e-06, "loss": 0.71843535, "num_input_tokens_seen": 141342235, "step": 6582, "time_per_iteration": 2.74564790725708 }, { "auxiliary_loss_clip": 0.01132763, "auxiliary_loss_mlp": 0.0077198, "balance_loss_clip": 1.04670966, "balance_loss_mlp": 1.00066948, "epoch": 0.39579137231324213, "flos": 18733699607040.0, "grad_norm": 2.093309053458098, "language_loss": 0.76838243, "learning_rate": 2.753071346464642e-06, "loss": 0.78742981, "num_input_tokens_seen": 141361195, "step": 6583, "time_per_iteration": 2.6127665042877197 }, { "auxiliary_loss_clip": 0.01084294, "auxiliary_loss_mlp": 0.00772199, "balance_loss_clip": 1.04135418, "balance_loss_mlp": 1.00058353, "epoch": 0.3958514955659101, "flos": 17676417755520.0, "grad_norm": 2.422087879109688, "language_loss": 0.66005278, "learning_rate": 2.7527105340615207e-06, "loss": 0.67861772, "num_input_tokens_seen": 141378275, "step": 6584, "time_per_iteration": 2.8412790298461914 }, { "auxiliary_loss_clip": 0.0109769, "auxiliary_loss_mlp": 0.01042803, "balance_loss_clip": 1.04634333, "balance_loss_mlp": 1.02687716, "epoch": 0.39591161881857806, "flos": 29309103901440.0, "grad_norm": 7.452692779947077, "language_loss": 0.72775561, "learning_rate": 2.7523496931155413e-06, "loss": 0.74916053, "num_input_tokens_seen": 141396960, "step": 6585, "time_per_iteration": 2.8504436016082764 }, { "auxiliary_loss_clip": 0.0109915, "auxiliary_loss_mlp": 0.01041099, "balance_loss_clip": 1.04335117, "balance_loss_mlp": 1.02628136, "epoch": 0.3959717420712461, "flos": 25771831332480.0, "grad_norm": 1.8603715362450812, "language_loss": 0.73381901, "learning_rate": 2.7519888236403856e-06, "loss": 0.75522149, "num_input_tokens_seen": 141417320, "step": 6586, "time_per_iteration": 2.8426311016082764 }, { "auxiliary_loss_clip": 0.01101854, "auxiliary_loss_mlp": 0.0103792, "balance_loss_clip": 1.04255629, "balance_loss_mlp": 1.02266693, "epoch": 0.39603186532391405, "flos": 20923783655040.0, "grad_norm": 2.174728382433504, "language_loss": 0.71447468, "learning_rate": 2.7516279256497382e-06, "loss": 0.73587245, "num_input_tokens_seen": 141435985, "step": 6587, "time_per_iteration": 2.7798478603363037 }, { "auxiliary_loss_clip": 0.01007869, "auxiliary_loss_mlp": 0.01003214, "balance_loss_clip": 1.02249742, "balance_loss_mlp": 1.00195026, "epoch": 0.396091988576582, "flos": 54880986176640.0, "grad_norm": 0.9478406102040471, "language_loss": 0.61186492, "learning_rate": 2.751266999157285e-06, "loss": 0.63197577, "num_input_tokens_seen": 141486075, "step": 6588, "time_per_iteration": 3.1663742065429688 }, { "auxiliary_loss_clip": 0.0110963, "auxiliary_loss_mlp": 0.00772247, "balance_loss_clip": 1.04547548, "balance_loss_mlp": 1.0006907, "epoch": 0.39615211182925, "flos": 20702896968960.0, "grad_norm": 3.1004380492305206, "language_loss": 0.81686854, "learning_rate": 2.7509060441767115e-06, "loss": 0.8356874, "num_input_tokens_seen": 141505280, "step": 6589, "time_per_iteration": 2.7711055278778076 }, { "auxiliary_loss_clip": 0.01106228, "auxiliary_loss_mlp": 0.01038149, "balance_loss_clip": 1.04562962, "balance_loss_mlp": 1.02241325, "epoch": 0.39621223508191794, "flos": 20994312009600.0, "grad_norm": 2.2429889858322802, "language_loss": 0.69913912, "learning_rate": 2.7505450607217057e-06, "loss": 0.72058284, "num_input_tokens_seen": 141523930, "step": 6590, "time_per_iteration": 2.793330669403076 }, { "auxiliary_loss_clip": 0.01117633, "auxiliary_loss_mlp": 0.01056421, "balance_loss_clip": 1.04669666, "balance_loss_mlp": 1.03980339, "epoch": 0.3962723583345859, "flos": 23368833417600.0, "grad_norm": 1.772211549409949, "language_loss": 0.75809395, "learning_rate": 2.750184048805956e-06, "loss": 0.77983451, "num_input_tokens_seen": 141541320, "step": 6591, "time_per_iteration": 2.7317981719970703 }, { "auxiliary_loss_clip": 0.01043506, "auxiliary_loss_mlp": 0.01049181, "balance_loss_clip": 1.03802264, "balance_loss_mlp": 1.03364813, "epoch": 0.39633248158725387, "flos": 25115599808640.0, "grad_norm": 2.064980952243903, "language_loss": 0.78466719, "learning_rate": 2.749823008443152e-06, "loss": 0.80559409, "num_input_tokens_seen": 141561880, "step": 6592, "time_per_iteration": 3.192194700241089 }, { "auxiliary_loss_clip": 0.01059924, "auxiliary_loss_mlp": 0.01033868, "balance_loss_clip": 1.03984666, "balance_loss_mlp": 1.01872826, "epoch": 0.39639260483992184, "flos": 39787622236800.0, "grad_norm": 1.9568402514544967, "language_loss": 0.69690341, "learning_rate": 2.7494619396469843e-06, "loss": 0.71784127, "num_input_tokens_seen": 141586460, "step": 6593, "time_per_iteration": 3.365752696990967 }, { "auxiliary_loss_clip": 0.01059564, "auxiliary_loss_mlp": 0.01046377, "balance_loss_clip": 1.03668404, "balance_loss_mlp": 1.03035569, "epoch": 0.3964527280925898, "flos": 17347045017600.0, "grad_norm": 1.624713370756075, "language_loss": 0.77905881, "learning_rate": 2.7491008424311452e-06, "loss": 0.80011821, "num_input_tokens_seen": 141605955, "step": 6594, "time_per_iteration": 2.890626907348633 }, { "auxiliary_loss_clip": 0.01025812, "auxiliary_loss_mlp": 0.01003509, "balance_loss_clip": 1.02550435, "balance_loss_mlp": 1.00200129, "epoch": 0.39651285134525777, "flos": 71717848369920.0, "grad_norm": 0.9363100872746896, "language_loss": 0.6304667, "learning_rate": 2.7487397168093265e-06, "loss": 0.65075988, "num_input_tokens_seen": 141673140, "step": 6595, "time_per_iteration": 3.3955094814300537 }, { "auxiliary_loss_clip": 0.01096586, "auxiliary_loss_mlp": 0.01055368, "balance_loss_clip": 1.0442034, "balance_loss_mlp": 1.03774858, "epoch": 0.39657297459792573, "flos": 25775710001280.0, "grad_norm": 2.5609780352809732, "language_loss": 0.63787287, "learning_rate": 2.748378562795223e-06, "loss": 0.65939242, "num_input_tokens_seen": 141692955, "step": 6596, "time_per_iteration": 4.60092568397522 }, { "auxiliary_loss_clip": 0.01120147, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.04657853, "balance_loss_mlp": 1.02747798, "epoch": 0.3966330978505937, "flos": 20266115587200.0, "grad_norm": 2.0315739566024567, "language_loss": 0.79006839, "learning_rate": 2.7480173804025293e-06, "loss": 0.81169188, "num_input_tokens_seen": 141710680, "step": 6597, "time_per_iteration": 5.807824373245239 }, { "auxiliary_loss_clip": 0.01099639, "auxiliary_loss_mlp": 0.00773402, "balance_loss_clip": 1.04352951, "balance_loss_mlp": 1.00076032, "epoch": 0.39669322110326166, "flos": 20631183465600.0, "grad_norm": 2.966898609781474, "language_loss": 0.6772182, "learning_rate": 2.747656169644941e-06, "loss": 0.69594866, "num_input_tokens_seen": 141729860, "step": 6598, "time_per_iteration": 2.786884307861328 }, { "auxiliary_loss_clip": 0.01129462, "auxiliary_loss_mlp": 0.01041455, "balance_loss_clip": 1.04473436, "balance_loss_mlp": 1.02785325, "epoch": 0.3967533443559297, "flos": 21726063878400.0, "grad_norm": 2.1804433985902247, "language_loss": 0.79342777, "learning_rate": 2.747294930536157e-06, "loss": 0.81513697, "num_input_tokens_seen": 141749060, "step": 6599, "time_per_iteration": 2.6758370399475098 }, { "auxiliary_loss_clip": 0.01091573, "auxiliary_loss_mlp": 0.01041619, "balance_loss_clip": 1.04314208, "balance_loss_mlp": 1.02487051, "epoch": 0.39681346760859765, "flos": 25484151306240.0, "grad_norm": 2.279505844275463, "language_loss": 0.72878486, "learning_rate": 2.7469336630898737e-06, "loss": 0.75011677, "num_input_tokens_seen": 141769860, "step": 6600, "time_per_iteration": 2.7616889476776123 }, { "auxiliary_loss_clip": 0.01083152, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.03626251, "balance_loss_mlp": 1.0220201, "epoch": 0.3968735908612656, "flos": 20959586536320.0, "grad_norm": 2.0515710245603938, "language_loss": 0.85973942, "learning_rate": 2.746572367319791e-06, "loss": 0.88094509, "num_input_tokens_seen": 141788465, "step": 6601, "time_per_iteration": 2.755791664123535 }, { "auxiliary_loss_clip": 0.01095713, "auxiliary_loss_mlp": 0.01041964, "balance_loss_clip": 1.0429877, "balance_loss_mlp": 1.02468467, "epoch": 0.3969337141139336, "flos": 10707090531840.0, "grad_norm": 2.240549855963289, "language_loss": 0.70372766, "learning_rate": 2.7462110432396095e-06, "loss": 0.72510445, "num_input_tokens_seen": 141804955, "step": 6602, "time_per_iteration": 4.643726348876953 }, { "auxiliary_loss_clip": 0.01133428, "auxiliary_loss_mlp": 0.01047809, "balance_loss_clip": 1.04658508, "balance_loss_mlp": 1.03230548, "epoch": 0.39699383736660154, "flos": 17593714690560.0, "grad_norm": 3.7711392584572896, "language_loss": 0.83248609, "learning_rate": 2.7458496908630305e-06, "loss": 0.85429847, "num_input_tokens_seen": 141820025, "step": 6603, "time_per_iteration": 2.8909716606140137 }, { "auxiliary_loss_clip": 0.01112282, "auxiliary_loss_mlp": 0.01034339, "balance_loss_clip": 1.04651403, "balance_loss_mlp": 1.02003431, "epoch": 0.3970539606192695, "flos": 17785945301760.0, "grad_norm": 1.9508498227264648, "language_loss": 0.73302728, "learning_rate": 2.7454883102037563e-06, "loss": 0.75449347, "num_input_tokens_seen": 141838735, "step": 6604, "time_per_iteration": 2.828908920288086 }, { "auxiliary_loss_clip": 0.01105132, "auxiliary_loss_mlp": 0.01038476, "balance_loss_clip": 1.04384422, "balance_loss_mlp": 1.02364659, "epoch": 0.3971140838719375, "flos": 24789495208320.0, "grad_norm": 1.769953580879433, "language_loss": 0.82582277, "learning_rate": 2.745126901275491e-06, "loss": 0.84725887, "num_input_tokens_seen": 141858090, "step": 6605, "time_per_iteration": 2.6773502826690674 }, { "auxiliary_loss_clip": 0.01128613, "auxiliary_loss_mlp": 0.01033129, "balance_loss_clip": 1.04549098, "balance_loss_mlp": 1.01968801, "epoch": 0.39717420712460544, "flos": 24243581329920.0, "grad_norm": 1.4871941413504006, "language_loss": 0.73511499, "learning_rate": 2.7447654640919383e-06, "loss": 0.75673246, "num_input_tokens_seen": 141877540, "step": 6606, "time_per_iteration": 2.632805347442627 }, { "auxiliary_loss_clip": 0.01089285, "auxiliary_loss_mlp": 0.01048599, "balance_loss_clip": 1.0436089, "balance_loss_mlp": 1.03198171, "epoch": 0.3972343303772734, "flos": 25884698843520.0, "grad_norm": 2.092571399644939, "language_loss": 0.74296981, "learning_rate": 2.744403998666805e-06, "loss": 0.76434863, "num_input_tokens_seen": 141897315, "step": 6607, "time_per_iteration": 2.7277770042419434 }, { "auxiliary_loss_clip": 0.01124169, "auxiliary_loss_mlp": 0.01037393, "balance_loss_clip": 1.04697132, "balance_loss_mlp": 1.02267027, "epoch": 0.39729445362994137, "flos": 45623716300800.0, "grad_norm": 1.5196847129379933, "language_loss": 0.6787042, "learning_rate": 2.744042505013797e-06, "loss": 0.70031989, "num_input_tokens_seen": 141919580, "step": 6608, "time_per_iteration": 2.8229119777679443 }, { "auxiliary_loss_clip": 0.01094928, "auxiliary_loss_mlp": 0.01054175, "balance_loss_clip": 1.04091311, "balance_loss_mlp": 1.03580451, "epoch": 0.39735457688260933, "flos": 20193971120640.0, "grad_norm": 7.314681050409252, "language_loss": 0.74670005, "learning_rate": 2.7436809831466233e-06, "loss": 0.7681911, "num_input_tokens_seen": 141937045, "step": 6609, "time_per_iteration": 2.7502245903015137 }, { "auxiliary_loss_clip": 0.01107217, "auxiliary_loss_mlp": 0.01036058, "balance_loss_clip": 1.04354, "balance_loss_mlp": 1.02056026, "epoch": 0.3974147001352773, "flos": 23331163029120.0, "grad_norm": 1.742454501323656, "language_loss": 0.713238, "learning_rate": 2.7433194330789927e-06, "loss": 0.73467076, "num_input_tokens_seen": 141956695, "step": 6610, "time_per_iteration": 2.7225286960601807 }, { "auxiliary_loss_clip": 0.01105851, "auxiliary_loss_mlp": 0.01030068, "balance_loss_clip": 1.03818822, "balance_loss_mlp": 1.01509547, "epoch": 0.39747482338794526, "flos": 21688644885120.0, "grad_norm": 1.7960063460415152, "language_loss": 0.78151029, "learning_rate": 2.7429578548246133e-06, "loss": 0.8028695, "num_input_tokens_seen": 141975935, "step": 6611, "time_per_iteration": 2.6464622020721436 }, { "auxiliary_loss_clip": 0.01121213, "auxiliary_loss_mlp": 0.01038273, "balance_loss_clip": 1.04614162, "balance_loss_mlp": 1.0235095, "epoch": 0.3975349466406133, "flos": 30988717816320.0, "grad_norm": 1.7937788130001704, "language_loss": 0.7921629, "learning_rate": 2.7425962483971985e-06, "loss": 0.81375778, "num_input_tokens_seen": 141995750, "step": 6612, "time_per_iteration": 2.734950304031372 }, { "auxiliary_loss_clip": 0.01018209, "auxiliary_loss_mlp": 0.0100828, "balance_loss_clip": 1.02113628, "balance_loss_mlp": 1.00702214, "epoch": 0.39759506989328125, "flos": 63683948833920.0, "grad_norm": 0.8423760762856193, "language_loss": 0.64935088, "learning_rate": 2.742234613810459e-06, "loss": 0.66961575, "num_input_tokens_seen": 142057655, "step": 6613, "time_per_iteration": 3.1294105052948 }, { "auxiliary_loss_clip": 0.01097901, "auxiliary_loss_mlp": 0.01042526, "balance_loss_clip": 1.03916883, "balance_loss_mlp": 1.02507401, "epoch": 0.3976551931459492, "flos": 23695835857920.0, "grad_norm": 3.0444472336295636, "language_loss": 0.71508956, "learning_rate": 2.741872951078109e-06, "loss": 0.73649383, "num_input_tokens_seen": 142076020, "step": 6614, "time_per_iteration": 2.6479976177215576 }, { "auxiliary_loss_clip": 0.01116106, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.04503131, "balance_loss_mlp": 1.02034712, "epoch": 0.3977153163986172, "flos": 15669657745920.0, "grad_norm": 2.2927333729520885, "language_loss": 0.81362098, "learning_rate": 2.741511260213862e-06, "loss": 0.83513486, "num_input_tokens_seen": 142093790, "step": 6615, "time_per_iteration": 2.6567723751068115 }, { "auxiliary_loss_clip": 0.01094954, "auxiliary_loss_mlp": 0.01034601, "balance_loss_clip": 1.04491544, "balance_loss_mlp": 1.02023649, "epoch": 0.39777543965128515, "flos": 14064702249600.0, "grad_norm": 2.01024859105405, "language_loss": 0.67510247, "learning_rate": 2.741149541231434e-06, "loss": 0.69639802, "num_input_tokens_seen": 142110545, "step": 6616, "time_per_iteration": 2.6675400733947754 }, { "auxiliary_loss_clip": 0.01133654, "auxiliary_loss_mlp": 0.01043633, "balance_loss_clip": 1.04658771, "balance_loss_mlp": 1.02765918, "epoch": 0.3978355629039531, "flos": 23367468700800.0, "grad_norm": 2.3086733420735785, "language_loss": 0.83678514, "learning_rate": 2.740787794144541e-06, "loss": 0.85855806, "num_input_tokens_seen": 142128695, "step": 6617, "time_per_iteration": 2.5879552364349365 }, { "auxiliary_loss_clip": 0.01126085, "auxiliary_loss_mlp": 0.01039432, "balance_loss_clip": 1.04570735, "balance_loss_mlp": 1.02563334, "epoch": 0.3978956861566211, "flos": 19062785036160.0, "grad_norm": 1.7795732635152253, "language_loss": 0.72766519, "learning_rate": 2.7404260189669e-06, "loss": 0.74932027, "num_input_tokens_seen": 142148375, "step": 6618, "time_per_iteration": 2.613162040710449 }, { "auxiliary_loss_clip": 0.01111951, "auxiliary_loss_mlp": 0.01041983, "balance_loss_clip": 1.04827428, "balance_loss_mlp": 1.02544832, "epoch": 0.39795580940928904, "flos": 30227699341440.0, "grad_norm": 1.6960793445061386, "language_loss": 0.65858316, "learning_rate": 2.740064215712231e-06, "loss": 0.68012249, "num_input_tokens_seen": 142169735, "step": 6619, "time_per_iteration": 2.7474000453948975 }, { "auxiliary_loss_clip": 0.01052495, "auxiliary_loss_mlp": 0.01004058, "balance_loss_clip": 1.0230546, "balance_loss_mlp": 1.00270545, "epoch": 0.398015932661957, "flos": 69847224906240.0, "grad_norm": 0.7704475067145287, "language_loss": 0.58246851, "learning_rate": 2.7397023843942527e-06, "loss": 0.60303402, "num_input_tokens_seen": 142229520, "step": 6620, "time_per_iteration": 3.1400091648101807 }, { "auxiliary_loss_clip": 0.01113547, "auxiliary_loss_mlp": 0.0103675, "balance_loss_clip": 1.04998422, "balance_loss_mlp": 1.02314794, "epoch": 0.39807605591462497, "flos": 20157773189760.0, "grad_norm": 1.821199996328267, "language_loss": 0.7925806, "learning_rate": 2.739340525026686e-06, "loss": 0.81408358, "num_input_tokens_seen": 142247660, "step": 6621, "time_per_iteration": 2.7389161586761475 }, { "auxiliary_loss_clip": 0.0110802, "auxiliary_loss_mlp": 0.01034956, "balance_loss_clip": 1.04590595, "balance_loss_mlp": 1.02088952, "epoch": 0.39813617916729294, "flos": 21141761339520.0, "grad_norm": 1.899291394170355, "language_loss": 0.77800381, "learning_rate": 2.738978637623252e-06, "loss": 0.79943347, "num_input_tokens_seen": 142266990, "step": 6622, "time_per_iteration": 2.7175779342651367 }, { "auxiliary_loss_clip": 0.01101638, "auxiliary_loss_mlp": 0.01038721, "balance_loss_clip": 1.04108417, "balance_loss_mlp": 1.02377844, "epoch": 0.3981963024199609, "flos": 18988485753600.0, "grad_norm": 1.6278701941081761, "language_loss": 0.7497921, "learning_rate": 2.738616722197674e-06, "loss": 0.77119565, "num_input_tokens_seen": 142287170, "step": 6623, "time_per_iteration": 2.682567596435547 }, { "auxiliary_loss_clip": 0.01088304, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.04280734, "balance_loss_mlp": 1.02590537, "epoch": 0.39825642567262887, "flos": 16575108808320.0, "grad_norm": 2.4968757127264465, "language_loss": 0.79733497, "learning_rate": 2.7382547787636766e-06, "loss": 0.81862563, "num_input_tokens_seen": 142305405, "step": 6624, "time_per_iteration": 2.6878697872161865 }, { "auxiliary_loss_clip": 0.01135858, "auxiliary_loss_mlp": 0.01043783, "balance_loss_clip": 1.04792297, "balance_loss_mlp": 1.0270462, "epoch": 0.39831654892529683, "flos": 22199833290240.0, "grad_norm": 2.0557211895564884, "language_loss": 0.83616954, "learning_rate": 2.7378928073349832e-06, "loss": 0.85796595, "num_input_tokens_seen": 142322710, "step": 6625, "time_per_iteration": 2.5847036838531494 }, { "auxiliary_loss_clip": 0.011152, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.04585958, "balance_loss_mlp": 1.02948713, "epoch": 0.39837667217796485, "flos": 10487963612160.0, "grad_norm": 2.4120237094780377, "language_loss": 0.87324822, "learning_rate": 2.737530807925321e-06, "loss": 0.89484465, "num_input_tokens_seen": 142338535, "step": 6626, "time_per_iteration": 2.5845320224761963 }, { "auxiliary_loss_clip": 0.01067442, "auxiliary_loss_mlp": 0.00775778, "balance_loss_clip": 1.03995085, "balance_loss_mlp": 1.00066137, "epoch": 0.3984367954306328, "flos": 17965282930560.0, "grad_norm": 2.3324132294494797, "language_loss": 0.83462882, "learning_rate": 2.737168780548417e-06, "loss": 0.85306096, "num_input_tokens_seen": 142354570, "step": 6627, "time_per_iteration": 2.854428291320801 }, { "auxiliary_loss_clip": 0.01087071, "auxiliary_loss_mlp": 0.00771611, "balance_loss_clip": 1.04081798, "balance_loss_mlp": 1.00056684, "epoch": 0.3984969186833008, "flos": 22711057608960.0, "grad_norm": 1.4575889504047923, "language_loss": 0.82904339, "learning_rate": 2.736806725217998e-06, "loss": 0.84763026, "num_input_tokens_seen": 142374395, "step": 6628, "time_per_iteration": 2.772620916366577 }, { "auxiliary_loss_clip": 0.01092039, "auxiliary_loss_mlp": 0.01062711, "balance_loss_clip": 1.04402328, "balance_loss_mlp": 1.04652882, "epoch": 0.39855704193596875, "flos": 23405785534080.0, "grad_norm": 1.6631347026103094, "language_loss": 0.71145642, "learning_rate": 2.7364446419477945e-06, "loss": 0.73300385, "num_input_tokens_seen": 142396040, "step": 6629, "time_per_iteration": 2.681969165802002 }, { "auxiliary_loss_clip": 0.01097676, "auxiliary_loss_mlp": 0.01035809, "balance_loss_clip": 1.04695797, "balance_loss_mlp": 1.02136111, "epoch": 0.3986171651886367, "flos": 21251935330560.0, "grad_norm": 1.757569665448266, "language_loss": 0.80513418, "learning_rate": 2.7360825307515366e-06, "loss": 0.82646906, "num_input_tokens_seen": 142415495, "step": 6630, "time_per_iteration": 2.7747275829315186 }, { "auxiliary_loss_clip": 0.01072778, "auxiliary_loss_mlp": 0.01032526, "balance_loss_clip": 1.04495096, "balance_loss_mlp": 1.01805389, "epoch": 0.3986772884413047, "flos": 12458705258880.0, "grad_norm": 2.3833222910170857, "language_loss": 0.74846494, "learning_rate": 2.7357203916429555e-06, "loss": 0.76951796, "num_input_tokens_seen": 142431865, "step": 6631, "time_per_iteration": 2.8098866939544678 }, { "auxiliary_loss_clip": 0.01095184, "auxiliary_loss_mlp": 0.01040404, "balance_loss_clip": 1.04248333, "balance_loss_mlp": 1.02500248, "epoch": 0.39873741169397264, "flos": 19646117907840.0, "grad_norm": 2.096728163981437, "language_loss": 0.7160908, "learning_rate": 2.735358224635783e-06, "loss": 0.73744667, "num_input_tokens_seen": 142450595, "step": 6632, "time_per_iteration": 2.81479811668396 }, { "auxiliary_loss_clip": 0.01063774, "auxiliary_loss_mlp": 0.00771132, "balance_loss_clip": 1.04164338, "balance_loss_mlp": 1.00057721, "epoch": 0.3987975349466406, "flos": 21684766216320.0, "grad_norm": 2.0680050346702945, "language_loss": 0.7479074, "learning_rate": 2.7349960297437533e-06, "loss": 0.76625645, "num_input_tokens_seen": 142466650, "step": 6633, "time_per_iteration": 2.9533073902130127 }, { "auxiliary_loss_clip": 0.01105798, "auxiliary_loss_mlp": 0.01028668, "balance_loss_clip": 1.0465138, "balance_loss_mlp": 1.01509583, "epoch": 0.3988576581993086, "flos": 23914064937600.0, "grad_norm": 1.7626671777587215, "language_loss": 0.81420207, "learning_rate": 2.7346338069806e-06, "loss": 0.83554673, "num_input_tokens_seen": 142486165, "step": 6634, "time_per_iteration": 2.760012626647949 }, { "auxiliary_loss_clip": 0.0110458, "auxiliary_loss_mlp": 0.0103091, "balance_loss_clip": 1.04739153, "balance_loss_mlp": 1.01618731, "epoch": 0.39891778145197654, "flos": 18149899858560.0, "grad_norm": 2.495702621722643, "language_loss": 0.74914795, "learning_rate": 2.7342715563600597e-06, "loss": 0.77050287, "num_input_tokens_seen": 142505035, "step": 6635, "time_per_iteration": 4.225152015686035 }, { "auxiliary_loss_clip": 0.01101511, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.04791617, "balance_loss_mlp": 1.02265239, "epoch": 0.3989779047046445, "flos": 22595281096320.0, "grad_norm": 28.19463582214486, "language_loss": 0.66373086, "learning_rate": 2.733909277895868e-06, "loss": 0.68513715, "num_input_tokens_seen": 142521870, "step": 6636, "time_per_iteration": 4.455794811248779 }, { "auxiliary_loss_clip": 0.01118899, "auxiliary_loss_mlp": 0.01041724, "balance_loss_clip": 1.04681683, "balance_loss_mlp": 1.02687669, "epoch": 0.39903802795731247, "flos": 18077216688000.0, "grad_norm": 2.0422591411720723, "language_loss": 0.81318372, "learning_rate": 2.733546971601763e-06, "loss": 0.83478993, "num_input_tokens_seen": 142540455, "step": 6637, "time_per_iteration": 4.3843090534210205 }, { "auxiliary_loss_clip": 0.0102804, "auxiliary_loss_mlp": 0.01018728, "balance_loss_clip": 1.02743387, "balance_loss_mlp": 1.01694012, "epoch": 0.39909815120998043, "flos": 70441367771520.0, "grad_norm": 0.719892771757815, "language_loss": 0.53119934, "learning_rate": 2.733184637491484e-06, "loss": 0.55166698, "num_input_tokens_seen": 142599665, "step": 6638, "time_per_iteration": 3.2910361289978027 }, { "auxiliary_loss_clip": 0.01112783, "auxiliary_loss_mlp": 0.00772668, "balance_loss_clip": 1.04786587, "balance_loss_mlp": 1.00065207, "epoch": 0.39915827446264845, "flos": 18549262247040.0, "grad_norm": 1.6115719033099838, "language_loss": 0.75487578, "learning_rate": 2.732822275578769e-06, "loss": 0.77373028, "num_input_tokens_seen": 142618845, "step": 6639, "time_per_iteration": 2.7083969116210938 }, { "auxiliary_loss_clip": 0.0105821, "auxiliary_loss_mlp": 0.01036909, "balance_loss_clip": 1.03856301, "balance_loss_mlp": 1.022264, "epoch": 0.3992183977153164, "flos": 29897249195520.0, "grad_norm": 2.505539025941121, "language_loss": 0.76163709, "learning_rate": 2.7324598858773603e-06, "loss": 0.78258824, "num_input_tokens_seen": 142640885, "step": 6640, "time_per_iteration": 2.8801841735839844 }, { "auxiliary_loss_clip": 0.01102565, "auxiliary_loss_mlp": 0.01038995, "balance_loss_clip": 1.04663992, "balance_loss_mlp": 1.02430892, "epoch": 0.3992785209679844, "flos": 22565080736640.0, "grad_norm": 2.779199402703341, "language_loss": 0.81995392, "learning_rate": 2.7320974684009996e-06, "loss": 0.84136951, "num_input_tokens_seen": 142659340, "step": 6641, "time_per_iteration": 4.346608638763428 }, { "auxiliary_loss_clip": 0.01136449, "auxiliary_loss_mlp": 0.01038781, "balance_loss_clip": 1.05189252, "balance_loss_mlp": 1.02393353, "epoch": 0.39933864422065235, "flos": 19682674974720.0, "grad_norm": 2.1545130527280985, "language_loss": 0.76744998, "learning_rate": 2.7317350231634288e-06, "loss": 0.78920233, "num_input_tokens_seen": 142677085, "step": 6642, "time_per_iteration": 2.656057596206665 }, { "auxiliary_loss_clip": 0.01106418, "auxiliary_loss_mlp": 0.01034072, "balance_loss_clip": 1.04871511, "balance_loss_mlp": 1.0196898, "epoch": 0.3993987674733203, "flos": 23038491012480.0, "grad_norm": 2.1744041926742788, "language_loss": 0.72387367, "learning_rate": 2.731372550178393e-06, "loss": 0.7452786, "num_input_tokens_seen": 142694595, "step": 6643, "time_per_iteration": 2.680995225906372 }, { "auxiliary_loss_clip": 0.01123145, "auxiliary_loss_mlp": 0.01040337, "balance_loss_clip": 1.04840899, "balance_loss_mlp": 1.02565074, "epoch": 0.3994588907259883, "flos": 19390828970880.0, "grad_norm": 1.7059817149479597, "language_loss": 0.6665355, "learning_rate": 2.7310100494596375e-06, "loss": 0.68817025, "num_input_tokens_seen": 142714175, "step": 6644, "time_per_iteration": 2.6378324031829834 }, { "auxiliary_loss_clip": 0.01130779, "auxiliary_loss_mlp": 0.0103839, "balance_loss_clip": 1.04629064, "balance_loss_mlp": 1.02349472, "epoch": 0.39951901397865625, "flos": 13734395758080.0, "grad_norm": 2.1425296608964937, "language_loss": 0.78164649, "learning_rate": 2.730647521020907e-06, "loss": 0.80333817, "num_input_tokens_seen": 142730955, "step": 6645, "time_per_iteration": 2.6268746852874756 }, { "auxiliary_loss_clip": 0.0112116, "auxiliary_loss_mlp": 0.01037104, "balance_loss_clip": 1.04624033, "balance_loss_mlp": 1.02252507, "epoch": 0.3995791372313242, "flos": 23586451966080.0, "grad_norm": 1.7492924628724136, "language_loss": 0.69861412, "learning_rate": 2.73028496487595e-06, "loss": 0.72019678, "num_input_tokens_seen": 142751200, "step": 6646, "time_per_iteration": 2.7350409030914307 }, { "auxiliary_loss_clip": 0.0107684, "auxiliary_loss_mlp": 0.01037342, "balance_loss_clip": 1.03799927, "balance_loss_mlp": 1.02223825, "epoch": 0.3996392604839922, "flos": 21355896268800.0, "grad_norm": 1.7623657715359762, "language_loss": 0.72017872, "learning_rate": 2.729922381038513e-06, "loss": 0.74132061, "num_input_tokens_seen": 142770170, "step": 6647, "time_per_iteration": 2.7607529163360596 }, { "auxiliary_loss_clip": 0.01093143, "auxiliary_loss_mlp": 0.01043089, "balance_loss_clip": 1.04529011, "balance_loss_mlp": 1.02973795, "epoch": 0.39969938373666014, "flos": 26032255914240.0, "grad_norm": 1.4563496549616326, "language_loss": 0.74217343, "learning_rate": 2.7295597695223463e-06, "loss": 0.7635358, "num_input_tokens_seen": 142792680, "step": 6648, "time_per_iteration": 2.8048219680786133 }, { "auxiliary_loss_clip": 0.01133606, "auxiliary_loss_mlp": 0.01037616, "balance_loss_clip": 1.04912674, "balance_loss_mlp": 1.02281022, "epoch": 0.3997595069893281, "flos": 20116367786880.0, "grad_norm": 2.0433040752683578, "language_loss": 0.6589973, "learning_rate": 2.7291971303412006e-06, "loss": 0.6807096, "num_input_tokens_seen": 142810510, "step": 6649, "time_per_iteration": 2.6976583003997803 }, { "auxiliary_loss_clip": 0.01103049, "auxiliary_loss_mlp": 0.01042133, "balance_loss_clip": 1.04713392, "balance_loss_mlp": 1.02803016, "epoch": 0.39981963024199607, "flos": 27783403764480.0, "grad_norm": 1.7319771659085785, "language_loss": 0.75106388, "learning_rate": 2.728834463508826e-06, "loss": 0.77251565, "num_input_tokens_seen": 142832455, "step": 6650, "time_per_iteration": 2.7441325187683105 }, { "auxiliary_loss_clip": 0.01132922, "auxiliary_loss_mlp": 0.01042591, "balance_loss_clip": 1.04873252, "balance_loss_mlp": 1.02803564, "epoch": 0.39987975349466404, "flos": 21944436612480.0, "grad_norm": 1.5673208577322473, "language_loss": 0.72102094, "learning_rate": 2.728471769038975e-06, "loss": 0.74277604, "num_input_tokens_seen": 142852590, "step": 6651, "time_per_iteration": 2.6027066707611084 }, { "auxiliary_loss_clip": 0.01132958, "auxiliary_loss_mlp": 0.01045235, "balance_loss_clip": 1.04850328, "balance_loss_mlp": 1.03093004, "epoch": 0.39993987674733206, "flos": 20704405340160.0, "grad_norm": 1.8492457158027382, "language_loss": 0.73126423, "learning_rate": 2.728109046945403e-06, "loss": 0.75304615, "num_input_tokens_seen": 142870595, "step": 6652, "time_per_iteration": 2.5880327224731445 }, { "auxiliary_loss_clip": 0.01029168, "auxiliary_loss_mlp": 0.01002764, "balance_loss_clip": 1.02822125, "balance_loss_mlp": 1.00134552, "epoch": 0.4, "flos": 61525429862400.0, "grad_norm": 0.8458278780382239, "language_loss": 0.60614997, "learning_rate": 2.727746297241862e-06, "loss": 0.62646931, "num_input_tokens_seen": 142925805, "step": 6653, "time_per_iteration": 3.1626622676849365 }, { "auxiliary_loss_clip": 0.01093219, "auxiliary_loss_mlp": 0.01039197, "balance_loss_clip": 1.04810715, "balance_loss_mlp": 1.02577376, "epoch": 0.400060123252668, "flos": 14502309644160.0, "grad_norm": 3.0617453661279788, "language_loss": 0.66701174, "learning_rate": 2.7273835199421085e-06, "loss": 0.6883359, "num_input_tokens_seen": 142943145, "step": 6654, "time_per_iteration": 2.696179151535034 }, { "auxiliary_loss_clip": 0.01119303, "auxiliary_loss_mlp": 0.01043738, "balance_loss_clip": 1.04738593, "balance_loss_mlp": 1.03145993, "epoch": 0.40012024650533595, "flos": 19093308618240.0, "grad_norm": 2.461149956206156, "language_loss": 0.89818919, "learning_rate": 2.7270207150599e-06, "loss": 0.91981959, "num_input_tokens_seen": 142956925, "step": 6655, "time_per_iteration": 2.601891279220581 }, { "auxiliary_loss_clip": 0.01100614, "auxiliary_loss_mlp": 0.01040322, "balance_loss_clip": 1.04367936, "balance_loss_mlp": 1.02693462, "epoch": 0.4001803697580039, "flos": 29351012094720.0, "grad_norm": 1.7913118709828861, "language_loss": 0.73551166, "learning_rate": 2.7266578826089917e-06, "loss": 0.75692105, "num_input_tokens_seen": 142978040, "step": 6656, "time_per_iteration": 2.705662727355957 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 142978040, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.5758006388156006e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }